{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyNki3b16cee2Z1bQl47SuGN",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/LC1332/Chat-Haruhi-Suzumiya/blob/main/notebook/%E5%B0%9D%E8%AF%95%E8%BD%AC%E5%8C%96RoleLLM%E4%B8%BAHaruhi%E6%A0%BC%E5%BC%8F.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip -q install transformers openai tiktoken langchain chromadb"
      ],
      "metadata": {
        "id": "Tx3MlbCeen2Y"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!git clone https://huggingface.co/datasets/ZenMoore/RoleBench"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0C91Et0icLQ0",
        "outputId": "a0b69f7a-0447-4179-dde0-70a4f1177aae"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'RoleBench'...\n",
            "remote: Enumerating objects: 259, done.\u001b[K\n",
            "remote: Counting objects: 100% (255/255), done.\u001b[K\n",
            "remote: Compressing objects: 100% (255/255), done.\u001b[K\n",
            "remote: Total 259 (delta 9), reused 0 (delta 0), pack-reused 4\u001b[K\n",
            "Receiving objects: 100% (259/259), 20.89 MiB | 6.73 MiB/s, done.\n",
            "Resolving deltas: 100% (9/9), done.\n",
            "Filtering content: 100% (8/8), 366.76 MiB | 41.52 MiB/s, done.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "- [ ] 分析desc\n",
        "- [ ] 解析jsonl，数一下token"
      ],
      "metadata": {
        "id": "n-9YnFV9dtuy"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "desc_path = \"/content/RoleBench/profiles-eng/desc.json\"\n",
        "movie_name_path = \"/content/RoleBench/profiles-eng/scripts.json\"\n",
        "\n",
        "# Parse the two RoleBench profile JSON files.\n",
        "\n",
        "import json\n",
        "\n",
        "# Open explicitly as UTF-8 (like read_jsonl) instead of relying on the platform default encoding.\n",
        "with open(desc_path, encoding=\"utf-8\") as f:\n",
        "    desc = json.load(f)\n",
        "\n",
        "with open(movie_name_path, encoding=\"utf-8\") as f:\n",
        "    movie_names = json.load(f)"
      ],
      "metadata": {
        "id": "5XgUTj4Vdb7Q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "role_names = [x for x in desc.keys()]\n",
        "print(role_names)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "M15PFFyOvzJn",
        "outputId": "1ddfc5d8-1296-4e7d-f0bc-7b973ea9f78e"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['Doctor Who', 'Mary Sibley', 'Lucifer Morningstar', 'Twilight Sparkle', 'Oliver Queen', 'Angel', 'Queen Catherine', 'Dr. Hannibal Lecter', 'HAL 9000', 'Colonel Nathan R. Jessep', 'Antonio Salieri', 'Stifler', 'Paul Vitti', 'Alvy Singer', 'Violet Weston', 'Sheldon Cooper', 'Willie Soke', 'Gaston', 'The Dude', 'Murphy MacManus', 'Paul Conroy', 'Truman Capote', 'Mater', 'Andrew Detmer', 'Coriolanus', 'Benjamin Button', 'John Keating', 'Wade Wilson', 'Jim Morrison', 'Queen Elizabeth I', 'Coach Eric Taylor', 'Jeff Spicoli', 'Fred Flintstone', 'Freddy Krueger', 'Tyrion Lannister', 'James Brown', 'Walt Kowalski', 'John Coffey', 'Theodore Twombly', 'Gregory House', 'Sonny', 'Colonel Hans Landa', 'Judge Dredd', 'Juno MacGuff', 'Po', 'Klaus Mikaelson', 'Professor G.H. Dorr', 'Fletcher Reede', 'Abraham Lincoln', 'Frank T.J. Mackey', 'Malcolm X', 'Leonard Shelby', 'Harvey Milk', 'Randle McMurphy', 'Jack Sparrow', 'John Dillinger', 'Lestat de Lioncourt', 'Tyler Hawkins', 'Caesar', 'Jack', 'Leroy Jethro Gibbs', 'James Carter', 'Jigsaw', 'John Doe', 'Jackie Moon', 'Sherlock Holmes', 'Shrek', 'Pat Solitano', 'Karl Childers', 'Peter Parker', 'Bruno Antony', 'Seth', 'Caden Cotard', 'Travis Bickle', 'Stanley Ipkiss', 'Raylan Givens', 'Lyn Cassady', 'Michael Scott', 'Robert Angier', 'Rachel Lang', 'Dr. Frank-N-Furter', 'Jack Torrance', 'Tom Ripley', 'D_Artagnan', 'Stephen Hawking', 'Thor', 'James Bond', 'Mark Renton', 'Tugg Speedman', 'David Aames', 'Rorschach', 'Jordan Belfort', 'Logan', 'Judy Hoops', 'Blair Waldorf']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import tiktoken\n",
        "\n",
        "enc = tiktoken.get_encoding(\"cl100k_base\")"
      ],
      "metadata": {
        "id": "MHcRTnq1fLWE"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def get_script_path( role_name ):\n",
        "    return f\"/content/RoleBench/profiles-eng/profiles-eng-{role_name}.jsonl\"\n",
        "\n",
        "\n",
        "def read_jsonl(file_path):\n",
        "    data = []\n",
        "    with open(file_path, encoding='utf-8') as f:\n",
        "        for line in f:\n",
        "            try:\n",
        "                data.append(json.loads(line))\n",
        "            except:\n",
        "                pass\n",
        "    return data\n",
        "\n",
        "ALLOW_SPLIT_TOKEN_LEN = 300\n",
        "ENFORCE_SPLIT_TOKEN_LEN = 700\n",
        "\n",
        "def divide_data_into_chunk( data ):\n",
        "    \"\"\"Group consecutive script lines into chunks bounded by token count.\n",
        "\n",
        "    Each record must provide 'act_id', 'diag_id', 'role' and 'content'.\n",
        "    Lines whose role starts with 'narr' (case-insensitive) are rendered as\n",
        "    'role:content'; all other lines as 'role:「content」'.\n",
        "\n",
        "    A new chunk starts when the act changes, when the dialogue id differs\n",
        "    from the one recorded at the previous split and the chunk would exceed\n",
        "    ALLOW_SPLIT_TOKEN_LEN tokens, or when the chunk would exceed\n",
        "    ENFORCE_SPLIT_TOKEN_LEN tokens. A single line whose rendering exceeds\n",
        "    ENFORCE_SPLIT_TOKEN_LEN has its content truncated.\n",
        "\n",
        "    Returns the list of chunk strings. `data` is left unmodified.\n",
        "    \"\"\"\n",
        "    def render(role, content):\n",
        "        # Narration is written bare, spoken lines are wrapped in 「」.\n",
        "        if str.lower(role).startswith('narr'):\n",
        "            return role + \":\" + content + '\\n'\n",
        "        return role + \":「\" + content + '」\\n'\n",
        "\n",
        "    chunk_data = []\n",
        "    last_act = -1\n",
        "    last_diag = -1\n",
        "    current_chunk = \"\"\n",
        "    len_current_chunk = 0\n",
        "\n",
        "    for d in data:\n",
        "        act_id = d['act_id']\n",
        "        diag_id = d['diag_id']\n",
        "        role = d['role']\n",
        "        content = d['content']\n",
        "\n",
        "        current_content = render(role, content)\n",
        "        len_current_content = len(enc.encode(current_content))\n",
        "\n",
        "        if len_current_content > ENFORCE_SPLIT_TOKEN_LEN:\n",
        "            # Shrink the content 20 characters at a time until its token count\n",
        "            # fits, then render the shortened text.\n",
        "            n = len(content)\n",
        "            while n > 0 and len(enc.encode(content[:n])) > ENFORCE_SPLIT_TOKEN_LEN:\n",
        "                n = max(n - 20, 0)\n",
        "            current_content = render(role, content[:n])\n",
        "            len_current_content = len(enc.encode(current_content))\n",
        "\n",
        "        # A change of act always starts a new chunk.\n",
        "        split_flag = act_id != last_act\n",
        "\n",
        "        # NOTE(review): last_diag is only refreshed when a split happens, so this\n",
        "        # compares against the dialogue id at the start of the current chunk - confirm intended.\n",
        "        if act_id == last_act and diag_id != last_diag:\n",
        "            if len_current_content + len_current_chunk > ALLOW_SPLIT_TOKEN_LEN:\n",
        "                split_flag = True\n",
        "\n",
        "        if len_current_content + len_current_chunk > ENFORCE_SPLIT_TOKEN_LEN:\n",
        "            split_flag = True\n",
        "\n",
        "        if split_flag:\n",
        "            if current_chunk != \"\":\n",
        "                chunk_data.append(current_chunk)\n",
        "            last_act = act_id\n",
        "            last_diag = diag_id\n",
        "            current_chunk = current_content\n",
        "            len_current_chunk = len_current_content\n",
        "        else:\n",
        "            current_chunk += current_content\n",
        "            len_current_chunk += len_current_content\n",
        "\n",
        "    # Flush the trailing chunk explicitly rather than appending a sentinel\n",
        "    # record to the caller's list.\n",
        "    if current_chunk != \"\":\n",
        "        chunk_data.append(current_chunk)\n",
        "\n",
        "    return chunk_data\n",
        "\n",
        "\n",
        "    # break\n"
      ],
      "metadata": {
        "id": "D21UUq0WfcY-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(chunk_data[4])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 179
        },
        "id": "mZVhwYINf15-",
        "outputId": "ea4a7c4e-f6b8-42a2-e653-d52765c0882d"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "error",
          "ename": "NameError",
          "evalue": "ignored",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-6-22e7ca1b60a6>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
            "\u001b[0;31mNameError\u001b[0m: name 'chunk_data' is not defined"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!mkdir -p /content/output"
      ],
      "metadata": {
        "id": "0cFDz4jIgXeM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "for role_name in desc:\n",
        "    print(role_name)\n",
        "    script_path = get_script_path(role_name)\n",
        "    data = read_jsonl(script_path)\n",
        "    chunk_data = divide_data_into_chunk(data)\n",
        "    # Token length of the longest chunk (0 when the role produced no chunks).\n",
        "    max_len = max((len(enc.encode(c)) for c in chunk_data), default=0)\n",
        "    print(role_name , ' max chunk ', max_len , ' n_chunk = ', len(chunk_data))\n",
        "\n",
        "    # Create a folder named after the role; no error if it already exists.\n",
        "    output_dir = f\"/content/output/{role_name}\"\n",
        "    os.makedirs(output_dir, exist_ok=True)\n",
        "\n",
        "    # Write each chunk to its own numbered text file.\n",
        "    for i, chunk in enumerate(chunk_data):\n",
        "        file_path = f\"{output_dir}/{i}.txt\"\n",
        "        with open(file_path, \"w\", encoding=\"utf-8\") as f:\n",
        "            f.write(chunk)\n",
        "    # Only the first role is processed; its chunk_data stays defined for later cells.\n",
        "    break"
      ],
      "metadata": {
        "id": "GQGET6cslD6P"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "max_token_len = 0\n",
        "\n",
        "def get_system_prompt( role_name ):\n",
        "    \"\"\"Build the role-play system prompt for `role_name`.\n",
        "\n",
        "    A fixed instruction header naming the character and its source\n",
        "    (looked up in `movie_names`) is followed by the role's entry in `desc`.\n",
        "    \"\"\"\n",
        "    character, series = role_name, movie_names[role_name]\n",
        "    header = f'''I want you to act like {character} from {series}.\n",
        "If others‘ questions are related with the novel, please try to reuse the original lines from the novel.\n",
        "I want you to respond and answer like {character} using the tone, manner and vocabulary {character} would use.\n",
        "You must know all of the knowledge of {character}.\n",
        "\n",
        "'''\n",
        "    return header + desc[role_name]\n",
        "\n",
        "\n",
        "\n",
        "for role_name in desc:\n",
        "    # Measure the prompt via get_system_prompt so the counted text is exactly\n",
        "    # what that helper returns (no second copy of the template to keep in sync).\n",
        "    current_len = len(enc.encode(get_system_prompt(role_name)))\n",
        "\n",
        "    max_token_len = max(current_len, max_token_len)\n",
        "\n",
        "\n",
        "print(max_token_len)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mEhMZsPdm6P3",
        "outputId": "86da0ab0-d8d9-4745-bfac-71086ddb581f"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "285\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# from transformers import AutoTokenizer, AutoModel\n",
        "# import torch\n",
        "# # Sentences we want sentence embeddings for\n",
        "# sentences = [\"an apple\", chunk_data[1],chunk_data[2],chunk_data[0]]\n",
        "\n",
        "# # Load model from HuggingFace Hub\n",
        "# tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')\n",
        "# model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5')\n",
        "# model.eval()\n",
        "\n",
        "# # Tokenize sentences\n",
        "# encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length = 512)\n",
        "# # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)\n",
        "# # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')\n",
        "\n",
        "# # Compute token embeddings\n",
        "# with torch.no_grad():\n",
        "#     model_output = model(**encoded_input)\n",
        "#     # Perform pooling. In this case, cls pooling.\n",
        "#     sentence_embeddings = model_output[0][:, 0]\n",
        "# # normalize embeddings\n",
        "# sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)\n",
        "# print(sentence_embeddings.shape)"
      ],
      "metadata": {
        "id": "n4SZNf0hnl9c"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "encoded_input有三个分量\n",
        "\n",
        "```python\n",
        "print(encoded_input.input_ids.shape)\n",
        "print(encoded_input.token_type_ids.shape)\n",
        "print(encoded_input.attention_mask.shape)\n",
        "```\n",
        "\n",
        "这三个shape是一样的，都是4*617\n",
        "\n",
        "我希望发现encoded_input.input_ids超过M * 512时\n",
        "\n",
        "自动把三个tensor都截断为 M * 512\n",
        "\n",
        "请用python为我实现"
      ],
      "metadata": {
        "id": "WDF6SZQ2tvIz"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import AutoTokenizer, AutoModel\n",
        "import torch\n",
        "\n",
        "_bge_model = None\n",
        "_bge_tokenizer = None\n",
        "\n",
        "def get_bge_embeddings( sentences ):\n",
        "    # unsafe ensure batch size by yourself\n",
        "\n",
        "    global _bge_model\n",
        "    global _bge_tokenizer\n",
        "\n",
        "    if _bge_model is None:\n",
        "        from transformers import AutoTokenizer, AutoModel\n",
        "        _bge_tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')\n",
        "        _bge_model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5')\n",
        "\n",
        "    _bge_model.eval()\n",
        "\n",
        "    # Tokenize sentences\n",
        "    encoded_input = _bge_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length = 512)\n",
        "\n",
        "    # Compute token embeddings\n",
        "    with torch.no_grad():\n",
        "        model_output = _bge_model(**encoded_input)\n",
        "        # Perform pooling. In this case, cls pooling.\n",
        "        sentence_embeddings = model_output[0][:, 0]\n",
        "    # normalize embeddings\n",
        "    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)\n",
        "    return sentence_embeddings.cpu().tolist()\n",
        "\n",
        "def get_bge_embedding( text_or_texts ):\n",
        "    \"\"\"Embed a single string (returns one vector) or a collection of strings (returns a list of vectors).\"\"\"\n",
        "    if not isinstance(text_or_texts, str):\n",
        "        # Collections go through the batched path.\n",
        "        return get_bge_embeddings_safe(text_or_texts)\n",
        "    single_batch = get_bge_embeddings([text_or_texts])\n",
        "    return single_batch[0]\n",
        "\n",
        "bge_batch_size = 16\n",
        "\n",
        "import math\n",
        "from tqdm import tqdm\n",
        "\n",
        "def get_bge_embeddings_safe(sentences):\n",
        "    \"\"\"Embed any number of sentences in slices of at most `bge_batch_size`.\n",
        "\n",
        "    Each slice is passed to get_bge_embeddings and the results are\n",
        "    concatenated in order; a tqdm bar tracks progress over the batches.\n",
        "    \"\"\"\n",
        "    total = len(sentences)\n",
        "    n_batches = math.ceil(total / bge_batch_size)\n",
        "    collected = []\n",
        "\n",
        "    for batch_idx in tqdm( range(n_batches) ):\n",
        "        lo = batch_idx * bge_batch_size\n",
        "        # Slicing past the end is clamped, so the last batch may be shorter.\n",
        "        collected += get_bge_embeddings(sentences[lo:lo + bge_batch_size])\n",
        "\n",
        "    return collected\n",
        "\n",
        "test_embed = get_bge_embeddings_safe(chunk_data[:35])\n",
        "test_single = get_bge_embedding(chunk_data[0])\n"
      ],
      "metadata": {
        "id": "iVobDYcRthoZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(len(test_embed))\n",
        "print(test_embed[33])"
      ],
      "metadata": {
        "id": "afLBfxLSx1-v"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [],
      "metadata": {
        "id": "yvXIvHWTydjR"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%cd /content\n",
        "!rm -rf /content/Haruhi-2-Dev\n",
        "!git clone https://github.com/LC1332/Haruhi-2-Dev\n",
        "%cd /content/Haruhi-2-Dev"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "utvC4sEJx7AQ",
        "outputId": "bebc5e7a-db3a-47c8-c3d1-66f4c475c148"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content\n",
            "Cloning into 'Haruhi-2-Dev'...\n",
            "remote: Enumerating objects: 782, done.\u001b[K\n",
            "remote: Counting objects: 100% (89/89), done.\u001b[K\n",
            "remote: Compressing objects: 100% (77/77), done.\u001b[K\n",
            "remote: Total 782 (delta 56), reused 27 (delta 12), pack-reused 693\u001b[K\n",
            "Receiving objects: 100% (782/782), 105.71 MiB | 36.93 MiB/s, done.\n",
            "Resolving deltas: 100% (386/386), done.\n",
            "/content/Haruhi-2-Dev\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "import torch\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "print(device)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "QZP6LlM81nX7",
        "outputId": "6316cd3f-80ae-4979-9f3c-9a3306be63c9"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "cuda\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "\n"
      ],
      "metadata": {
        "id": "GXwGX4Ix18hH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "import getpass\n",
        "import openai\n",
        "\n",
        "# Never store a literal API key in the notebook: use the environment variable if it is\n",
        "# already set, otherwise prompt for the key without echoing it.\n",
        "key = os.environ.get(\"OPENAI_API_KEY\") or getpass.getpass(\"OPENAI_API_KEY: \")\n",
        "os.environ[\"OPENAI_API_KEY\"] = key\n",
        "\n",
        "openai.api_key = key\n",
        "\n"
      ],
      "metadata": {
        "id": "hcsoOBRv250S"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# def get_embedding_openai(text, model=\"text-embedding-ada-002\"):\n",
        "#     text = text.replace(\"\\n\", \" \")\n",
        "#     return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']\n",
        "\n",
        "# print(get_embedding_openai(chunk_data[0]))"
      ],
      "metadata": {
        "id": "7Q9EWgpF6Sgr"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "\n",
        "# embeddings = OpenAIEmbeddings(model = \"text-embedding-ada-002\")\n",
        "\n",
        "# text = chunk_data[0]\n",
        "# text = text.replace(\"\\n\", \" \")\n",
        "# result = embeddings.embed_query(text)"
      ],
      "metadata": {
        "id": "Sze6MYzQ6uHc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(result)\n",
        "print(len(result))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 197
        },
        "id": "cCKgtcIE61S-",
        "outputId": "60e6b0af-0714-46ae-d4c3-10f5adc35690"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "error",
          "ename": "NameError",
          "evalue": "ignored",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-12-03198386620a>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mNameError\u001b[0m: name 'result' is not defined"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from tqdm import tqdm\n",
        "def get_embeddings_openai( texts ):\n",
        "    \"\"\"Embed each text with the OpenAI 'text-embedding-ada-002' model via LangChain.\n",
        "\n",
        "    Newlines are replaced by spaces before embedding. If a request fails,\n",
        "    the error is reported and a zero vector of length 1536 is stored for\n",
        "    that text, so the result keeps one entry per input text.\n",
        "    \"\"\"\n",
        "    ans = []\n",
        "    embeddings = OpenAIEmbeddings(model = \"text-embedding-ada-002\")\n",
        "    for text in tqdm(texts):\n",
        "        text = text.replace(\"\\n\", \" \")\n",
        "        try:\n",
        "            result = embeddings.embed_query(text)\n",
        "        except Exception as e:\n",
        "            # Best effort: keep going, but report what went wrong instead of hiding it.\n",
        "            print('warning!', repr(e))\n",
        "            result = [0] * 1536\n",
        "        ans.append(result)\n",
        "    return ans\n",
        "\n",
        "# embeds = get_embeddings_openai(chunk_data[:3])\n"
      ],
      "metadata": {
        "id": "y8_hQ4U37XW5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def package_role( system_prompt, texts_path , embedding ):\n",
        "    \"\"\"Bundle a role's system prompt and text chunks together with their embeddings.\n",
        "\n",
        "    Args:\n",
        "        system_prompt: prompt text stored as the first record.\n",
        "        texts_path: directory whose *.txt files are read as the role's chunks.\n",
        "        embedding: callable that maps a chunk string to its embedding vector.\n",
        "\n",
        "    Returns:\n",
        "        A list of dicts with the keys 'text' and 'luotuo_openai'. The first\n",
        "        two records hold the system prompt and a reserved config placeholder;\n",
        "        every further record holds one chunk and its embedding as encoded by\n",
        "        float_array_to_base64.\n",
        "    \"\"\"\n",
        "    # float_array_to_base64 is not defined in this notebook, so import it here.\n",
        "    # NOTE(review): requires the Haruhi-2-Dev checkout to be importable - confirm.\n",
        "    from ChatHaruhi.utils import float_array_to_base64\n",
        "\n",
        "    datas = []\n",
        "\n",
        "    # Only one embedding name is used for now: 'luotuo_openai'\n",
        "    embed_name = 'luotuo_openai'\n",
        "\n",
        "    datas.append({ 'text':system_prompt , embed_name:'system_prompt'})\n",
        "    datas.append({ 'text':'Reserve Config Setting Here' , embed_name:'config'})\n",
        "\n",
        "    files = os.listdir(texts_path)\n",
        "\n",
        "    # The notebook imports `from tqdm import tqdm`, so tqdm is called directly.\n",
        "    for file in tqdm(files):\n",
        "        # Only text files are chunks; everything else in the folder is ignored.\n",
        "        if file.endswith(\".txt\"):\n",
        "            file_path = os.path.join(texts_path, file)\n",
        "            with open(file_path, 'r', encoding='utf-8') as f:\n",
        "                current_str = f.read()\n",
        "            current_vec = embedding(current_str)\n",
        "            encode_vec = float_array_to_base64(current_vec)\n",
        "            datas.append({ 'text':current_str , embed_name:encode_vec})\n",
        "\n",
        "    return datas"
      ],
      "metadata": {
        "id": "Eigf45fC1uUv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "先处理数据较少的角色\n",
        "\n",
        "将角色根据chunk数从小到大排列"
      ],
      "metadata": {
        "id": "bG2DmkKZ7-Sa"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# role_infos = []\n",
        "\n",
        "# for role_name in desc:\n",
        "#   script_path = get_script_path(role_name)\n",
        "#   data = read_jsonl(script_path)\n",
        "#   chunk_data = divide_data_into_chunk(data)\n",
        "\n",
        "#   max_len = 0\n",
        "#   for c in chunk_data:\n",
        "#     max_len = max(max_len,len(enc.encode(c)))\n",
        "\n",
        "#   len_chunk = len(chunk_data)\n",
        "\n",
        "#   role_infos.append((role_name, len_chunk))\n",
        "\n",
        "# role_infos.sort(key=lambda x: x[1])\n",
        "\n",
        "# sorted_names = [x[0] for x in role_infos]"
      ],
      "metadata": {
        "id": "3pC9dvCG6MSr"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(sorted_names)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rIR88Prn8TVf",
        "outputId": "0fd1bcbf-8e88-40fc-ca31-661d4470be39"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['Caesar', 'Sonny', 'Angel', 'Jigsaw', 'John Doe', 'Freddy Krueger', 'Colonel Hans Landa', 'Gregory House', 'Gaston', 'HAL 9000', 'Mark Renton', 'Coriolanus', 'Oliver Queen', 'Bruno Antony', 'D_Artagnan', 'Dr. Frank-N-Furter', 'Tugg Speedman', 'Stifler', 'Jeff Spicoli', 'Rorschach', 'Paul Vitti', 'Logan', 'Judge Dredd', 'Karl Childers', 'Rachel Lang', 'Queen Elizabeth I', 'Tyrion Lannister', 'John Keating', 'Wade Wilson', 'Lyn Cassady', 'Dr. Hannibal Lecter', 'Violet Weston', 'Po', 'Malcolm X', 'Willie Soke', 'Jack Torrance', 'Alvy Singer', 'Colonel Nathan R. Jessep', 'Andrew Detmer', 'Fred Flintstone', 'Frank T.J. Mackey', 'Stephen Hawking', 'Lestat de Lioncourt', 'Jack Sparrow', 'John Coffey', 'Murphy MacManus', 'John Dillinger', 'Jackie Moon', 'Peter Parker', 'Abraham Lincoln', 'James Carter', 'Tyler Hawkins', 'Stanley Ipkiss', 'Mater', 'Professor G.H. Dorr', 'Juno MacGuff', 'Seth', 'Sherlock Holmes', 'Truman Capote', 'Shrek', 'Travis Bickle', 'Jack', 'Tom Ripley', 'The Dude', 'David Aames', 'Twilight Sparkle', 'Antonio Salieri', 'Judy Hoops', 'Randle McMurphy', 'Thor', 'Walt Kowalski', 'Fletcher Reede', 'Theodore Twombly', 'James Brown', 'Paul Conroy', 'James Bond', 'Queen Catherine', 'Harvey Milk', 'Caden Cotard', 'Leonard Shelby', 'Jim Morrison', 'Pat Solitano', 'Benjamin Button', 'Robert Angier', 'Lucifer Morningstar', 'Jordan Belfort', 'Coach Eric Taylor', 'Mary Sibley', 'Klaus Mikaelson', 'Raylan Givens', 'Sheldon Cooper', 'Michael Scott', 'Leroy Jethro Gibbs', 'Doctor Who', 'Blair Waldorf']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "sorted_names = ['Caesar', 'Sonny', 'Angel', 'Jigsaw', 'John Doe', 'Freddy Krueger', 'Colonel Hans Landa', 'Gregory House', 'Gaston', 'HAL 9000', 'Mark Renton', 'Coriolanus', 'Oliver Queen', 'Bruno Antony', 'D_Artagnan', 'Dr. Frank-N-Furter', 'Tugg Speedman', 'Stifler', 'Jeff Spicoli', 'Rorschach', 'Paul Vitti', 'Logan', 'Judge Dredd', 'Karl Childers', 'Rachel Lang', 'Queen Elizabeth I', 'Tyrion Lannister', 'John Keating', 'Wade Wilson', 'Lyn Cassady', 'Dr. Hannibal Lecter', 'Violet Weston', 'Po', 'Malcolm X', 'Willie Soke', 'Jack Torrance', 'Alvy Singer', 'Colonel Nathan R. Jessep', 'Andrew Detmer', 'Fred Flintstone', 'Frank T.J. Mackey', 'Stephen Hawking', 'Lestat de Lioncourt', 'Jack Sparrow', 'John Coffey', 'Murphy MacManus', 'John Dillinger', 'Jackie Moon', 'Peter Parker', 'Abraham Lincoln', 'James Carter', 'Tyler Hawkins', 'Stanley Ipkiss', 'Mater', 'Professor G.H. Dorr', 'Juno MacGuff', 'Seth', 'Sherlock Holmes', 'Truman Capote', 'Shrek', 'Travis Bickle', 'Jack', 'Tom Ripley', 'The Dude', 'David Aames', 'Twilight Sparkle', 'Antonio Salieri', 'Judy Hoops', 'Randle McMurphy', 'Thor', 'Walt Kowalski', 'Fletcher Reede', 'Theodore Twombly', 'James Brown', 'Paul Conroy', 'James Bond', 'Queen Catherine', 'Harvey Milk', 'Caden Cotard', 'Leonard Shelby', 'Jim Morrison', 'Pat Solitano', 'Benjamin Button', 'Robert Angier', 'Lucifer Morningstar', 'Jordan Belfort', 'Coach Eric Taylor', 'Mary Sibley', 'Klaus Mikaelson', 'Raylan Givens', 'Sheldon Cooper', 'Michael Scott', 'Leroy Jethro Gibbs', 'Doctor Who', 'Blair Waldorf']"
      ],
      "metadata": {
        "id": "z_ZQ5iIuSN4T"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# for role_name in sorted_names:\n",
        "#     script_path = get_script_path(role_name)\n",
        "#     data = read_jsonl(script_path)\n",
        "#     chunk_data = divide_data_into_chunk(data)\n",
        "#     print(len(chunk_data))\n",
        "#     break"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bCQ47G7G8bUN",
        "outputId": "dd237436-a7fa-40ac-9fd0-c4a70c631e64"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "13\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from ChatHaruhi.utils import get_bge_embeddings_safe\n",
        "\n",
        "# embed_openai = get_embeddings_openai(chunk_data[:])\n",
        "# embed_bge = get_bge_embeddings_safe(chunk_data[:])\n"
      ],
      "metadata": {
        "id": "CW86odT186_t"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# datas = []\n",
        "\n",
        "# system_prompt = get_system_prompt( role_name )\n",
        "\n",
        "# embed_name = 'luotuo_openai'\n",
        "# embed_name_bge = 'bge_en_s15'\n",
        "\n",
        "# datas.append({ 'text':system_prompt , embed_name:'system_prompt', embed_name_bge:'system_prompt'})\n",
        "# datas.append({ 'text':'Reserve Config Setting Here' , embed_name:'config', embed_name_bge:'config'})\n",
        "\n",
        "# from ChatHaruhi.utils import float_array_to_base64\n",
        "\n",
        "# for text,embed1, embed2 in zip(chunk_data, embed_openai, embed_bge):\n",
        "#     encode_vec1 = float_array_to_base64(embed1)\n",
        "#     encode_vec2 = float_array_to_base64(embed2)\n",
        "#     datas.append({ 'text':text , embed_name:encode_vec1, embed_name_bge:encode_vec2})\n"
      ],
      "metadata": {
        "id": "t1lcKD4p9Qos"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# print(datas[3])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-lEUR7mk--kO",
        "outputId": "b1c6cb40-a38f-40d9-e627-003de77fea6a"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'text': \"narrator:STEWART He's right on target. Height. Weight... She checks Caesar's ears.\\nCaesar:「Dairy Queen.」\\nWILL:「After. Sit still.」\\nSTEWART:「What about diet?」\\nWILL:「A healthy diet. Fruits, vegetables.」\\nCaesar:「Ice cream. Will ignores him.」\\nSTEWART:「What's he saying?」\\nWILL:「He's hungry. Caesar's angling to climb off the examining table. Will sends him a look - out it out.」\\nSTEWART:「He seems extremely intelligent. How many words does Caesar sign? Beat.」\\nWILL:「You know, the basics.」\\nCaesar:「Soft-serve.」\\n\", 'luotuo_openai': 'vHNa1LxkZe08C1U/vHiHHDynVaQ8voAfPAhovbziX7q8EjFcu+VpBj0oHyo9KB8qvHgT9DvvwZY8GhCSu0qIhD0RZ9Y6QC/0PICJh70nrAI6DOhKvDF0obxQJ/S7FTqoOhP9+roBGXg82p1Ou8I2Jrv6jU67cjrxPP+AAztMVSO7sBsqvMEWQ7x/Rm+8iWvWvEe4zL0j9fs7mmbwu7F0oTvNkdC7nFBYPD/2YLwEIsS8eIccuvvmxTzm/BG8glYlu5wzjjteGcI8pPkUPH2zZLzPtMw8I0k/PEM5PzxOPos8y/7FvFJK8LuG1bK8vfAtPC5rVTyGYoq8spSEPEecArwoWL28RALFuohovTwavU27ZWkGPQQixDwr8fs8yUvXu/R6t7wqe7o9FwdGu7Dkr7poVYg76iInN87rRzwwqxu77JuBvD1Dcrwv4ZY9MZFrPJ2NBTx0QSO8iEvzO90WqLyXtAI8MpSEvEV5Bjw9Q3K7N/pgPQrFTbuvxMw8pU9yO/egzTwyWvC8Oq1OvPyTgTxRgWs6q9Uxu0EWQ7uk+RS7pxwQu/zNFby7PT87JIXsu4kyQrzShIS8TQHePGnokzw9Q3K8kS5Du/vmxbqSh7o82X1rvMJvur0BGXi8qOivPIi/GzywVL08PvNHuttKCjukorY7/XnQvFc9pbysZSK8n+mWu7KUhD0ZDXm8fQapPJWRBbxL/sW8hZkFuozoSryRLkM8fwzbPGk717yGf1Q8MuriPA/xlbwH2Ms6dZqavOLvrDurYgk8IilcPLPt+7w1gQa8aa7/O5G+NLxrCHY8lXQ7OUZfVTv0B489N6QCOvLnrDxR18m8IvLhOZ2pz7wKUiY8jggtPFwTj7yxrjQ7ncaZPQyutj0lbDy74l+6u24ujDlpdWs7XYnQPQBP87w98C083MBLu6OfnTzmwn07raHPOpxQWDtGtbO8EjFcuYkVeD0H9ZU6y4udPG7bRzvcTSO8zwgRPJswdbyaZvA8he9jvKjorzzNdQY8+eCTvBUd3rqWWou/ItYXvJ8DRju4bYc8BSXdPIGpaj0zB6u6NzDaPKTcSjtE6RQ8/CBZu8nbybxIZYi8KXihu3Cn5r0EeSK75+JgO6WJBb0Ivxu5scr+Op6QH7zgAyo8lFRYvMBMvjxhP9c8/iaMPIw7jjxfVm+8mi1cuzUN3rstEd69E8RnPRlj1ruwGyo8s5edPVkKQ7uU5Eq8eBP0PJlj1rwS+uE8NERZvNbnR70KxU08q9Uxu48LRjtL/sW7lBrEPCnO/jwTp527Af/IvNAn9Dvb2fs7YK/mPBJq8DsylIQ8WAcqu7HK/j0v/mC8v4M4O3OUaDyZDXk8JYkFusJvurzATL48PLOAO5EReTz4hxy8xVw8PM+YAjy9Q3I75EkjO7yWtjxC/6y8RALFO7Gu
NDxrQgo76FWIPStiCbvJEkM8wv+sO3kz1zuqe7q7/FntvDJa8DwpsjS8XcNkPAhovbw33ZY8VaqaO/Tt37xsm4G7lZEFPKZSizsLG6u8sKsbO/b0ETv3oM27tkqLvF5wHzy2vbO9ERF5uuoiJz0Ivxu7g3YJPDgXKjyHZaQ8qpiEvKsodTzBbKE8kxervJ45wTxoVYi75S9yvQbyfDtdFqi7ledjvM1YPD0FQqc8bNUVvMr7rLyaEJI8h/WVPPZHVj0BxjS8cG5SOxWtzzzZ08m8bqGzPAOvnL0B/8g7m8Bnu34mjLwMAfo89LRLvBArKToxAXk8P4M4PCp7urydGd48eBP0vFVxBrv9Bqm7pfwtPLdqbry4M/S8lyQQvMPJMTxOBPe7vCOPvNIuJrygP/O8AMMbvIzoSrx7AHa78nSEvErCGDvR18m8YSMNvNJK8Dl+00e8hZkFOy6k6TycM469QGmIOxVXcTuMdSK6zFUjPJ4ALbyQKym8lndVvPwgWTvsYe28M5edPKyetjzkn4G8MQF5PINZP7xXBBE8PSaoO/pTu7uX0My8Wbb/vH8M27z/DNu8mEPzPOLvrLrv+yo8nnNVPS2hz7xlEqg8lXQ7vLSatjsPRNq7Ao+5vJ882jzFBd674Ol5O4dlpDvgdlI8apVOPEbvRzy112M8UJscPQzoSjuuwbM8xVw8vJTkSjwEIsS8w8kxPHOUaDsuMcE7hywQvJHa/ryrm508WmO6O+J8hDu1ndC8t/pgPQGpajvShIQ7Aalqu5Fn1jp/Rm88iBJfO2OcZ7zEkrY8vgz3PGu1MTwMy4C8h0javIAWXzuGRcG8jVtxO7/ZljvaRvC7KMvlvPI68Tyy6uK85NkVO79mbjuztGe7iRV4PLTxFDyGRcG89EEjPHgT9DrZQ9c85A+PPIQ/jryc4Eo8tfQtPISV7Dvv+yq9BQkTPIpSJjwxdKG71OEUPGRl7TsMkew8NZ3QPO/BljuvqAI8U937OZm6NDx/DNs8gnLvu5yJ7LyWzbI7IM/lPAGMoLsxAXk8YtLivAf1lbklpc88EJ5ROuvuxbuBUww8DrToOzR97Dx9s2Q8v6ACuwuO07zmiOk8vkaLPIH/yLugsxs8RnwfOuDMsLvOsbO8lpQfPOBZiLwi8uE8slrwvFqAhDwp68g6v6ACPJOnnTvcMFm79AePPA/UzLo8lrY8mNPlvEs1QDuPuAK9CBJfPH4mjLyl/C29JNxKvMa1szy7PT+6sne6vBmdarqfdm68pRXevEVcPDuua1W7OePIvKbFs7xqzuI9EIGHPCDsr7zEH4+8QhlcvJSN7LzJvv892dPJPS6IHzxsKFk8a+7FO7S3gDp8zRW77YHQO2K2GD0JpWq8WZo1vHegzbpy56w8aeiTuzYQ9zx3LaW7nOBKu45Bwbyb+fs70r4Yu942i7zYQL48qgiSPF0zcj0mNcE6FVdxvJlj1jxgPL483eAuO8kSQ7xboGe5VXEGu5tNP7tc+d48rEhYvMoVXDwmb1U8G/n7O+LS4jy4M/Q4N02kPEM5Pzxs1RW8OjomO17jRzuZKkO81DRZO8pruj0mqOm81XEGvIAWXz0OCC267/squ43rY7x4+kQ8L1GkPKvx+7yXem68raHPuYyutjzCGVy8gBZfO2kCQ7tk9d656XVru9cEEbxgH/S8vX0Gu8BpiLyRLkM8hM+Au0vh+7wplWq8bi6Mu60uqDwBNkI8QoyEPILmF7yiuU68sR5DvFG6/7wYfYe81P3evNVxBr1S2uK8q2IJPI7RsjwYJym7ZIK3vPS0Szxp6JM8NH3sPNThFDsb3TG81VQ8O6S/gLxprv+7XRaovEre4jxZfWu8RZXQOx1Tcr0I2+W8xHXtu40FFDzJojW8g6+cPBOnnbtanU68he9jvEOsZzw8lra75k9VOr0mqDvRDkO6mLcbPNT93jx0B488Gr1NPDztFLq/oAI9CWvWvMoyJjzJvv87im7vvHJ0
hDzXINu7wL/lO5kqQ7yVkQU8BAX6PAmlarsjZgk8mfPIvKnryL0W6nw72Zo1vK/EzDybMHW8JjXBu5ftljwPmzg7KkImO/PN+7xfABE6gTZCvLjD5TnYQL48ABZfO7j9eTu3MNo73wARPCCWUbw5cKE8nanPPP2zZL0IEl+8AMMbPEL/rDzPCBE8KrVOPMFsoTvFXDw8mzB1vCao6bz6Gie7pL+AvMC/5bz29BG8vJa2umNGCjy+nOm4zutHu8tSCTyzJHW8G/n7PQqogzyQ1+W7tGEivRP9+r1NWDw7rA7FuklL1zuh78i76wh2u4wexLy2hB89BkXBu9lD17yec1W8GGC9PFTESrzvTm87yfiSvMV5BrvBwv+6OwOsvPKuGDxLUgm6lBrEOdXkLjwTUT88qXihvBY9wbrTh527wPl5PCFDDbtwblK8XaaaO4Hi/ryaoIS7uyB1vIyR7LyNlQW8tr2zvCJ/uruif7q8as7iPColXLtrQgq80xR2uYV8O7v3oM09DQUUvHNa1DzWdB889rp9PIriF7xSZ7o7gqyDvG27ZDtcMFk8QhlcO7MHqzzoVYg8F5c4vDnG/jw+KcG8la3PvXNa1Dtauhg8xkKLO/YNwrzEr4C76s7iurBxhzyZnWq8g1k/vGJC8Lz1YQa8gcY0vICJh7vM5RS8W6BnPNMxQDuNBRQ7xwwRvLt207zs1RU7RbKaOjgz9DwTbgk8h2WkvNsQdjwztGc7Fj3Burt20zwoHyq8UGGIvFOHnTz8k4G8THHtvL8QETuJwjS9A3YJOxZ3VblKTvC5gKZRvEwbjzspW9e8i3IJPILmFzyN62O8zpTpuyxIWDvrtTE8BywQvJTkSjuKqIM8q2IJvL/Zlr0zXgm8gjlcO1UaqDuGnB48EffIvRBH87t32mA8rk6LvKccEDzLGHa9A3YJPH6Zs70AT/M8RXkGO4Jy7zszXgk9KbI0vBlj1rtr7sW843+eO9ThFDpKFVy7bGHtPFc9pTxfOaU8XnAfPHegzbt7c568xewtux5zVbu87RQ6Yu+sPIhovbyv/mA8Ax+rO+RJI7vaKia7iqiDvQw7jruKGJK9GkomPRZ3VbwI+K87WZo1vCF8oLyZnWq8galqPBqDurvWkOm8jkHBPPsAdjzxx8k8NoQfPI8oELsQKym9BOxKu5/MzLxJEkO9Iyx1vFm2/7mOtOg7nOBKPHIBXby5jWu9DDuOPCHvyDwsK487BXw7u8fVlruGRcE9BLK2vC2hzzwYQ/M8+E2IOzRhIjwMAfq8aFWIvLTxFLuLVT+8n8zMvNG6/zwMdSK8Azx1uy802jx11C67NoQfPKbFszxYs+Y8d6DNvDuwZ7tQCyq7fCBZOoAzKbuX0My8rxgQuzW6mbp4TYg8G6OdubVHcr0yWvC8ADMpO5C7G7w+Y1U7l12kvJsTq7vK3uK8lVdxO6X8LTywGyo8JGkivPA0vjyZY9a7iou5vKO8Z7xAv+U8yoiEvAPpMDsfWaQ815QCvB02qDwEXFg8kGS9PMU/crzSoU48gBZfPBnW/juS+uG8wYlrPNugZzuiRia8sgSSueaI6bwWIPe7tzDau1Pd+7zeU1W4ZS9yvI1bcbyL5TA6VldVvLvNMbyYmlG8TMhKvHngk7rVx2S8AuYXPn5gILnnNaW8G8BnPSJi8LtUxEo8ZdwuPU8IETkI2+W8DDuOumbCfbvk9d48GPCvvJEuQztBbKE8x7jMvNc9pbygXL28+BP0PHOUaL1PtMw8VwQRO6IMkrwkv4C7lHEiPSCWUTvwp+a8k1E/u2OcZzyO7nw8rRHevJ+wAruEP466kk4mO08IEbxISL68JzjaO0zlFD0oO/M7lBrEPALJTTySTia85285u7rKGLz3LaW7kIGHusUiqLup68g7tArFvETMSj0BUwy84c/Ju5LBTTzPtMw9EWfWu7EBeTx3oM08wd/IuukCQ7wm/0a8dO3fvCRMWDzrQgq8rWg8PK7e
fTseVos8h9jLvJ45wTyQDl+7fXnQvIlPDDzHRaS9MI5RvAzoSrri76y9RB+PvRPEZzy902M8I9kxuwiiUTy1gQa8jpgePM3oLryG1bK7zltVvIPpMLzkgrc8wKMbPG4ujDxqIie8y6hnO61oPLzN6C45PdNjO8rCGDxBbKG78G5SvKbifDyZgKC8Du58vDt207wiYvC78OF6PIGMoDznbzm7icI0u8w4WbxI2K88xK+APIKsg7yhmWq7kYSgvO5oHzweVou5mbo0O2JC8DsDPHW78uesvBswdTzOzn29Gbo0PI0FFLw7PT88OlbwPPxZ7Tv0tEu8yWihvJ3jY7w8QFm7Mcr+vIHGNDwFCRO9ECspumtCCjwiDJK8BD+OO/zNFTyL5TA7Fup8vLj9ebugllE7qiVcu7rKGDvGX1U7apVOPS77Rrz6Gie74ZY1PBVXcbwoy+W7ePpEvEVcPDwOCC28iou5PAW1zzuqQia8gwLhvM+YAr0iDJK8pWw8PBsTq7z4E/S6CU8MPJWRBbzWHcG8fiaMvL7zR741Kqg8yzVAPOfiYLzBwv88lFRYvIpu7zy8QFm7XnAfu86xszxVcQY7YK/mu3A0vrsyBJK81pDpu+Hsk7umjB+8+IccOzIEkjwfIBA9AuYXPSLy4byQnlE7EIGHuof1lTyu+0Y9A+kwOwvIZjwx58i7Ao+5vDHK/rwlwpk6+W1ru9maNTunVaQ8yWihuy0uqLxkn4G8hJXsvRGEoDznNaU8r8TMPUXPZDzCNia8RSKou3dnOTyLcgk8LNhKOxL64TzDj528rsGzOzwjj70VkQU8iPivPMtu0zwNId48BkXBvFLa4jzhz8m8lMeAvJGEoDo7dtO9DkHBPJH3yLyrRT+74XlrvNd3OLzyrhg8BgwtvG8U2zvwNL666s7ivNdabzyrftO8J3JuvAvIZjugsxu8oe/IvM8k2zzBbKG8W70xu3xZ7bupBXm8obY0u6GZajx3LaU65fj4PBqDurtf5mA80JscvLtaCTx0QSO8hn9UvPjAsLwHvAG7+E2IO8CjG7us9RQ8uMPlO6K5TryzQT+8IilcOhsTq7yVdDs8awh2Oxv5+zypsjS6GqCEPIriF7wSavA83cNkvAyR7DxNkdA8dvQRPF9zOTwTbgk9DDuOvQ8LRryGRcE9BFxYPCkFeTzDj5073lNVvH4mjDyVOqi7f4ADukH8kr3xjjW82roYvGnokzy80Eo7txQQPHkz1zraRvA6T+5gvS4U9z0AT/O87YHQvSdVpLtVqpo8MpSEvFSKtztOW1U8Qd/IvKJGJrwFfDs8kzR1OgD8rru9YDw7/XnQu7gz9DxdFqg8ifvIvLWBBjrB38g8jZUFPI+4AjwoO/O8lcqZPLdqbrxGQou7llqLvNBhiLxZQ9c8t92Wu1jQr708eew8obY0PI0FFLxhXKG9FyQQPBIUkrzEPFm8kzR1vALmF7y+gB87PCOPvL6c6TxTwTG8XPneO1+szDvLqGc7hdKZPEYlwbxO60c8jiT3PFDxebv0QSM76s7ivJ92bjrMjrc8FQEUPDXXY7zR9JM7xu9HO4Sytry+80c7ClImPH+5lzweOcE9JIXsvK+oAru5xv68T5gCvL+gAjyY0+W8vAbFvEMcdrztDqg8x9WWvP2zZD0UVFg8sZFrvA0+pzu4UL4736zMvAqogzuXem48SjImPTcw2rv2un27yjImO9PBMbvvFNs71pDpPCY1wbzwp+a8obY0vAe8Ab1S96w9JRXevOvuxbyG1bK7y8UxvDpW8LunVaS7vgz3vERZIzzxGw29ImLwPKG2NDug7K878Y41vJCBhzvQt+U8IHmHu+oiJzwGYoo89rp9PJWRBTyLVT872/bFOlXkLjvk9d47suriupKHujw+KcG8QGmIvLF0oTsu3n28X5ACPBRxIjy/Zm47ODP0vDPt+zsxrjQ6f/Mqut6pszyHvAG79y2lvL1gPDvzlGi7srFOu+B2
UjwZ1v69BZkFPGnokz0ZDXk8c5RoPKB5hzw7zTG6Vsp9vJCeUbwFCRO7jHUiO/EbDTqKi7m7KFi9vDeHODylpc+8g3YJu2DpebwdjQU8Oq1Ou4oYkje4bYc7MpSEu8I2Jr0Uqra7Cm7vu5zDgDwgXL26GtoXPLa9szxAoxs73MBLujlwoTo80Eo9Bn9UPA8LRruLOHW72/bFPH7TRzzEWSO7D35uvCWlzzwEz4C8ZzWlO8GJazuiDJI79g3CORvAZ7vBpjU8gE/zPI2VBbz0ere8hgwtPLS3gDyinIS7sj4mPFmaNbupBXm7p+WWO/vmxTiFX3G8Kc7+vRIxXLsthQa8B/WVO04hwTwWd1U88DS+PMMcdryaZvC8T15uvLaEH7tFspq8tGEiO7REWTxjDHa8cVShPGvuxTxZCkM8Bw9GPB2pz7roVYi7VDRZO4GparyYYL08OsoYvOX4+DvPCBG8lQEUOxBH8zvfyZa8wGmIPGbCfb0QgYc9bChZO27bR7u2EPc6SoiEvLuwZz0Z88g8SdvJvNWqmryBjKA8RHXtPBRUWLy5cKE5xu9HvJraF7vQ8Xk7rS6oO8YlwTyfdm68qAJgvJJOJjyTUT88f4ADO5q9TTwXXaS8O1oJOtJnujzEPFk7vJa2Oqt+070kTFi8F9DMPGNGCrySwU28rJ62uiyB7LqT/fq8GJpRO9dabzziQvC78Y41vI3rY7ryAV28xc9kvFm2/zvY0K+8geL+O/J0hLyoWL29Azx1', 'bge_en_s15': 'vYACejyFVWw8GTOpPInva7sd1Yw7PvhWPZwykz0Nh8g87CCevBAg5zq9lZu9AnxkvOheUTzkoH49ZdqXvDAaVDyONFg8wOozvSmE4j2KO1M9Ki+ivXtI47ybpYm9AjMkurxx6ru4qXG7ICsEvPZHur0nOju+QYtpOzhcCLvr3/89kr9IvVKlrLyDI2s8vqDiPECAYzwXoTS81F67PYDPQz0KwbM7307AvOhYA7rxBYA87toGvQ9M5L0lCW66X0V9Pdf4EDysAnq95TfCvTIS3ru9r469IA/2OuPLWj2cUPk90tQCu8ynSD1sqh89HrFjO4NhYzyjzeq92D2SPZ1ZcT0DmgQ9GKFHvXrc9Ts3dDm8wRC4Paf1Lz0CWoy9dCQ9O7BxHTu3vOC9O1qKPJAjIzywwnS7WHZuPK1tYTysZsO9qFvCPWpzYr1/h1S8njdKvMEApbxVlQK8WHPlPDM6uDvFpZO9N4U2u6VD8L1LBCS9Q3awPETkDb2VQ5+9F8chvLwYJL0052e9ji58PsU0Ijzdypa9KzRkOzH9Pj0A9dI88PBQu127sT03Bna9UgvoPa2fPLxd1B06quH3vAM5Hj0FdLC8KNMXPYUH2D2XVVA96dJePA5SWLxUPTa8+vI2PXa90DyJLBS6mkCFPJOcjT1HAYq9SZedPdCu7T1edr69U7i6PBig5z0g0iq8FyoEOz9RZzyP+Q08vvNKvDczSz1ZGuK7NIjyPRwoB70sQ7S8D1ZUvSFq8r0ZBE29RnuSvLwGNj2Ym889PbFWvQK8Qb0Nyz48wJK0PTrSNjvLQJI9pgoBvSQUWbzceRU6mtyevJsozzsqsE69LJxfvFoZNT0fSOq9BG3HvJ7GIzyqTes9NDwKvOIHK71+3Ki7BlcAPIxAvb06EJg9nnPivJcNwL3JJtQ87G6Iu9vDZjzTslW9VGs8PQDLiDxR3OQ8wz3iOwjqD7zmblW8ojNwPVzaTb1KI6q8mzrqvFG7fr0drBk9S43BO8eJJz1qpa09F5/4vcGkpD0F2ii93UA6vY3gXjxxads7yN9MvYukZjxthj29/WvHvaokRb099Co9T2dYPR6EV7x/x2Y8IKO8vLmHej2Z/ou7AE6HO+ljtTzpDTK88f4NPFney70WTQw8hpqRPWsWtz0r1KM8USABvMUxsTylFnA7mj3iPLtF876s05U8x4eePCuOmrzlQCI7Z0kWvO
zhLj1q/3G8wjPLPXwD/TuQv/c8yi4rvQ7En7yZA1S8gMlRO+Xvfj0R5po7cRYOvM93HbuBMbw9MlPlPRVp9TzoIuC8ac7fvVnXgrvC2qi8p5oAPgcDbT2yu807s0jMvDau7TshT+y7qoglt6NpfL15k6g9SkQPPSmASTxVoEe82xSwu58A0Dn7jiq9Jal1PWNQrL0JGn67obyWvYmANTmkdAQ710lFPJZxmj0+GMo9MoH7PPpuTj0mPDi9JqzMPIZYYztCJCa8h2AnvWfmGbx/v8q9NrypPYFz671QVq09OSSgPBm3T7wNxYk7e5w2PCDdwjyPn9G9VV+dPIn8Vzw5FZ69RaHCu9pb2zpU9x29nYb3OurIPzvB+Fg7r0c8vX7OLb2pu628yR9YPIpSNj0dS+68AxzRvOqBTTzQHA29OFavO/TCurr19Og9wf2GvDDSIzzeKMk9IPGivG5IZbywwyg9g1R5PQfWur6SWBs9Q+wcvHib5jzNmFY7QXiPOyft0TrqohS86gJHvZxoILuQotw9kFqVvMzANDubhLA7pE7lvIV35Ly2hLM9b3bXvJ478byksg47j0PlPKk1fjuJiZ49sQw3PLkkdrwQXKy8KLDHvVChz70Ewpc8vK0zPX/0Pzrsbdy6Nq4pPRqoRjwuWiY7pp5XPKxeP7wDqW09ANqgusNHcTvP/PO7nsROPDf2GTwbEQO9ZwQgOp9tRTxr3i08S9RCvDjJpLu2FbO9AXj/vHtBLb2oCeA8dAGDvIWJkj1F9Kc8gq/Fu8Pvt7xyb5o9IAeuPHk7q71avt+9cJZpPclPRD0SF6a7keoN'}\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "7i4apcLV_lsL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "# Mount Google Drive at /content/drive; the export loop later in this notebook saves its .jsonl files under /content/drive/MyDrive.\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xd9bSvrA_9x5",
        "outputId": "f7f4390e-a28e-46dd-b184-af9c9faf4784"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "\n",
        "# Show which files currently exist in the Drive folder that the export loop writes to.\n",
        "!ls /content/drive/MyDrive/Role_from_RoleLLM"
      ],
      "metadata": {
        "id": "p0r0rF69AEZe",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "143040b5-4e3f-4f3f-997f-6d9f4e85a7d1"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Caesar.jsonl\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "import json\n",
        "\n",
        "def write_jsonl(datas, file_name):\n",
        "    \"\"\"Write every item of `datas` to `file_name`, one JSON document per line.\n",
        "\n",
        "    The target is opened in 'w' mode, so any existing content is replaced.\n",
        "    Text is encoded as UTF-8 and non-ASCII characters are kept unescaped.\n",
        "    \"\"\"\n",
        "    with open(file_name, 'w', encoding='utf-8') as out_file:\n",
        "        out_file.writelines(\n",
        "            json.dumps(record, ensure_ascii=False) + \"\\n\" for record in datas\n",
        "        )\n",
        "\n"
      ],
      "metadata": {
        "id": "n8nGcwU2AZSK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "# Loop-invariant setup, hoisted so it runs once instead of once per role.\n",
        "from ChatHaruhi.utils import float_array_to_base64\n",
        "\n",
        "# Keys under which the two embeddings of each chunk are stored in the output records.\n",
        "embed_name = 'luotuo_openai'\n",
        "embed_name_bge = 'bge_en_s15'\n",
        "\n",
        "for role_name in sorted_names:\n",
        "    # Read the role's script and split it into text chunks.\n",
        "    script_path = get_script_path(role_name)\n",
        "    data = read_jsonl(script_path)\n",
        "    chunk_data = divide_data_into_chunk(data)\n",
        "\n",
        "    # Compute both embedding sets; each helper is handed a shallow copy of the chunk list.\n",
        "    embed_openai = get_embeddings_openai(chunk_data[:])\n",
        "    embed_bge = get_bge_embeddings_safe(chunk_data[:])\n",
        "\n",
        "    system_prompt = get_system_prompt( role_name )\n",
        "\n",
        "    # The first two records are headers: their embedding fields carry marker strings, not vectors.\n",
        "    datas = [\n",
        "        { 'text':system_prompt , embed_name:'system_prompt', embed_name_bge:'system_prompt'},\n",
        "        { 'text':'Reserve Config Setting Here' , embed_name:'config', embed_name_bge:'config'},\n",
        "    ]\n",
        "    header_count = len(datas)\n",
        "\n",
        "    # One record per chunk, with both embeddings passed through float_array_to_base64.\n",
        "    for text, embed1, embed2 in zip(chunk_data, embed_openai, embed_bge):\n",
        "        datas.append({\n",
        "            'text': text,\n",
        "            embed_name: float_array_to_base64(embed1),\n",
        "            embed_name_bge: float_array_to_base64(embed2),\n",
        "        })\n",
        "\n",
        "    # zip() stops at the shortest input, so a short embedding list would silently drop chunks.\n",
        "    # Fail before writing rather than save an incomplete role file.\n",
        "    written_chunks = len(datas) - header_count\n",
        "    if written_chunks != len(chunk_data):\n",
        "        raise ValueError(\n",
        "            f'{role_name}: got embeddings for {written_chunks} of {len(chunk_data)} chunks'\n",
        "        )\n",
        "\n",
        "    save_name = \"/content/drive/MyDrive/Role_from_RoleLLM/\" + role_name + \".jsonl\"\n",
        "    write_jsonl(datas,save_name)\n"
      ],
      "metadata": {
        "id": "LIpknjOMAhwI",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "0c07e464-02e8-4798-ec3f-75e16dad3ead"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "  8%|▊         | 1/13 [00:00<00:01,  8.45it/s]WARNING:langchain.embeddings.openai:Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: Internal server error {\n",
            "    \"error\": {\n",
            "        \"message\": \"Internal server error\",\n",
            "        \"type\": \"auth_subrequest_error\",\n",
            "        \"param\": null,\n",
            "        \"code\": \"internal_error\"\n",
            "    }\n",
            "}\n",
            " 500 {'error': {'message': 'Internal server error', 'type': 'auth_subrequest_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Fri, 20 Oct 2023 04:13:42 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '166', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-request-id': '64d051740109e038c8f31c5bea2aa30c', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '818e6de34f787ba4-LAX', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n",
            " 23%|██▎       | 3/13 [00:09<00:30,  3.04s/it]WARNING:langchain.embeddings.openai:Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: Internal server error {\n",
            "    \"error\": {\n",
            "        \"message\": \"Internal server error\",\n",
            "        \"type\": \"auth_subrequest_error\",\n",
            "        \"param\": null,\n",
            "        \"code\": \"internal_error\"\n",
            "    }\n",
            "}\n",
            " 500 {'error': {'message': 'Internal server error', 'type': 'auth_subrequest_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Fri, 20 Oct 2023 04:13:51 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '166', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-request-id': '9d6a18997d823d5f6b69618fd6015d7d', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '818e6e1dbfdc7ba4-LAX', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n",
            " 31%|███       | 4/13 [00:18<00:49,  5.47s/it]WARNING:langchain.embeddings.openai:Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: Internal server error {\n",
            "    \"error\": {\n",
            "        \"message\": \"Internal server error\",\n",
            "        \"type\": \"auth_subrequest_error\",\n",
            "        \"param\": null,\n",
            "        \"code\": \"internal_error\"\n",
            "    }\n",
            "}\n",
            " 500 {'error': {'message': 'Internal server error', 'type': 'auth_subrequest_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Fri, 20 Oct 2023 04:14:00 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '166', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-request-id': '03498e07503db2ca9b3f7a1067ab14e6', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '818e6e572e777ba4-LAX', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n",
            " 38%|███▊      | 5/13 [00:27<00:54,  6.81s/it]WARNING:langchain.embeddings.openai:Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: Internal server error {\n",
            "    \"error\": {\n",
            "        \"message\": \"Internal server error\",\n",
            "        \"type\": \"auth_subrequest_error\",\n",
            "        \"param\": null,\n",
            "        \"code\": \"internal_error\"\n",
            "    }\n",
            "}\n",
            " 500 {'error': {'message': 'Internal server error', 'type': 'auth_subrequest_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Fri, 20 Oct 2023 04:14:09 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '166', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-request-id': '3a1056522b79640b0b5588f8fe9cd9b5', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '818e6e909d7e7ba4-LAX', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n",
            " 46%|████▌     | 6/13 [00:37<00:53,  7.62s/it]WARNING:langchain.embeddings.openai:Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: Internal server error {\n",
            "    \"error\": {\n",
            "        \"message\": \"Internal server error\",\n",
            "        \"type\": \"auth_subrequest_error\",\n",
            "        \"param\": null,\n",
            "        \"code\": \"internal_error\"\n",
            "    }\n",
            "}\n",
            " 500 {'error': {'message': 'Internal server error', 'type': 'auth_subrequest_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Fri, 20 Oct 2023 04:14:19 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '166', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-request-id': '77ddee53e8e4a7dbc4a13e301b6e2704', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '818e6ec9fe647ba4-LAX', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n",
            "100%|██████████| 13/13 [00:50<00:00,  3.85s/it]\n",
            "100%|██████████| 1/1 [00:04<00:00,  4.19s/it]\n",
            " 42%|████▏     | 8/19 [00:01<00:01,  5.59it/s]WARNING:langchain.embeddings.openai:Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: Internal server error {\n",
            "    \"error\": {\n",
            "        \"message\": \"Internal server error\",\n",
            "        \"type\": \"auth_subrequest_error\",\n",
            "        \"param\": null,\n",
            "        \"code\": \"internal_error\"\n",
            "    }\n",
            "}\n",
            " 500 {'error': {'message': 'Internal server error', 'type': 'auth_subrequest_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Fri, 20 Oct 2023 04:14:38 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '166', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-request-id': 'abd98a3b5003eb71e4d1c508415b487e', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '818e6f4249f07ba4-LAX', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n",
            "100%|██████████| 19/19 [00:12<00:00,  1.51it/s]\n",
            "100%|██████████| 2/2 [00:05<00:00,  2.98s/it]\n",
            "100%|██████████| 21/21 [00:03<00:00,  5.84it/s]\n",
            "100%|██████████| 2/2 [00:15<00:00,  7.95s/it]\n",
            "100%|██████████| 22/22 [00:04<00:00,  5.39it/s]\n",
            "100%|██████████| 2/2 [00:17<00:00,  8.58s/it]\n",
            "100%|██████████| 22/22 [00:03<00:00,  5.94it/s]\n",
            "100%|██████████| 2/2 [00:14<00:00,  7.13s/it]\n",
            "100%|██████████| 25/25 [00:03<00:00,  6.37it/s]\n",
            "100%|██████████| 2/2 [00:18<00:00,  9.29s/it]\n",
            "100%|██████████| 36/36 [00:06<00:00,  5.56it/s]\n",
            "100%|██████████| 3/3 [00:11<00:00,  3.86s/it]\n",
            "100%|██████████| 37/37 [00:05<00:00,  6.77it/s]\n",
            "100%|██████████| 3/3 [00:21<00:00,  7.09s/it]\n",
            "100%|██████████| 41/41 [00:05<00:00,  7.01it/s]\n",
            "100%|██████████| 3/3 [00:16<00:00,  5.62s/it]\n",
            "100%|██████████| 44/44 [00:06<00:00,  6.45it/s]\n",
            "100%|██████████| 3/3 [00:28<00:00,  9.55s/it]\n",
            "100%|██████████| 51/51 [00:07<00:00,  6.69it/s]\n",
            "100%|██████████| 4/4 [00:19<00:00,  4.90s/it]\n",
            "100%|██████████| 54/54 [00:08<00:00,  6.30it/s]\n",
            "100%|██████████| 4/4 [00:14<00:00,  3.60s/it]\n",
            "100%|██████████| 58/58 [00:09<00:00,  6.19it/s]\n",
            "100%|██████████| 4/4 [00:43<00:00, 10.81s/it]\n",
            "100%|██████████| 58/58 [00:09<00:00,  5.98it/s]\n",
            "100%|██████████| 4/4 [00:15<00:00,  4.00s/it]\n",
            "100%|██████████| 58/58 [00:08<00:00,  6.86it/s]\n",
            "100%|██████████| 4/4 [00:42<00:00, 10.67s/it]\n",
            "100%|██████████| 62/62 [00:09<00:00,  6.51it/s]\n",
            "100%|██████████| 4/4 [00:44<00:00, 11.16s/it]\n",
            "100%|██████████| 63/63 [00:09<00:00,  6.36it/s]\n",
            "100%|██████████| 4/4 [00:29<00:00,  7.39s/it]\n",
            "100%|██████████| 66/66 [00:10<00:00,  6.37it/s]\n",
            "100%|██████████| 5/5 [00:48<00:00,  9.77s/it]\n",
            "100%|██████████| 66/66 [00:10<00:00,  6.15it/s]\n",
            "100%|██████████| 5/5 [00:46<00:00,  9.25s/it]\n",
            "100%|██████████| 66/66 [00:09<00:00,  6.63it/s]\n",
            "100%|██████████| 5/5 [00:23<00:00,  4.77s/it]\n",
            "100%|██████████| 67/67 [00:10<00:00,  6.29it/s]\n",
            "100%|██████████| 5/5 [00:49<00:00,  9.86s/it]\n",
            "100%|██████████| 67/67 [00:11<00:00,  5.71it/s]\n",
            "100%|██████████| 5/5 [00:24<00:00,  4.95s/it]\n",
            "100%|██████████| 70/70 [00:11<00:00,  6.07it/s]\n",
            "100%|██████████| 5/5 [00:29<00:00,  5.88s/it]\n",
            "100%|██████████| 70/70 [00:11<00:00,  6.24it/s]\n",
            "100%|██████████| 5/5 [00:50<00:00, 10.17s/it]\n",
            "100%|██████████| 70/70 [00:09<00:00,  7.21it/s]\n",
            "100%|██████████| 5/5 [00:18<00:00,  3.71s/it]\n",
            "100%|██████████| 72/72 [00:11<00:00,  6.13it/s]\n",
            "100%|██████████| 5/5 [00:26<00:00,  5.24s/it]\n",
            "100%|██████████| 72/72 [00:11<00:00,  6.46it/s]\n",
            "100%|██████████| 5/5 [00:52<00:00, 10.54s/it]\n",
            "100%|██████████| 73/73 [00:11<00:00,  6.57it/s]\n",
            "100%|██████████| 5/5 [00:53<00:00, 10.75s/it]\n",
            "100%|██████████| 74/74 [00:13<00:00,  5.62it/s]\n",
            "100%|██████████| 5/5 [00:24<00:00,  4.98s/it]\n",
            "100%|██████████| 76/76 [00:11<00:00,  6.86it/s]\n",
            "100%|██████████| 5/5 [00:23<00:00,  4.65s/it]\n",
            "100%|██████████| 77/77 [00:11<00:00,  6.70it/s]\n",
            "100%|██████████| 5/5 [00:57<00:00, 11.54s/it]\n",
            "100%|██████████| 78/78 [00:11<00:00,  6.68it/s]\n",
            "100%|██████████| 5/5 [00:36<00:00,  7.26s/it]\n",
            "100%|██████████| 78/78 [00:11<00:00,  6.79it/s]\n",
            "100%|██████████| 5/5 [00:22<00:00,  4.56s/it]\n",
            "100%|██████████| 82/82 [00:12<00:00,  6.40it/s]\n",
            "100%|██████████| 6/6 [01:04<00:00, 10.68s/it]\n",
            "100%|██████████| 83/83 [00:11<00:00,  7.01it/s]\n",
            "100%|██████████| 6/6 [00:52<00:00,  8.76s/it]\n",
            "100%|██████████| 84/84 [00:12<00:00,  6.80it/s]\n",
            "100%|██████████| 6/6 [00:23<00:00,  3.91s/it]\n",
            "100%|██████████| 87/87 [00:14<00:00,  6.11it/s]\n",
            "100%|██████████| 6/6 [01:04<00:00, 10.70s/it]\n",
            "100%|██████████| 88/88 [00:13<00:00,  6.54it/s]\n",
            "100%|██████████| 6/6 [01:01<00:00, 10.28s/it]\n",
            "100%|██████████| 92/92 [00:15<00:00,  5.91it/s]\n",
            "100%|██████████| 6/6 [00:27<00:00,  4.59s/it]\n",
            "100%|██████████| 92/92 [00:15<00:00,  6.04it/s]\n",
            "100%|██████████| 6/6 [01:07<00:00, 11.23s/it]\n",
            "100%|██████████| 93/93 [00:15<00:00,  6.14it/s]\n",
            "100%|██████████| 6/6 [01:07<00:00, 11.32s/it]\n",
            "100%|██████████| 93/93 [00:13<00:00,  6.96it/s]\n",
            "100%|██████████| 6/6 [00:20<00:00,  3.35s/it]\n",
            "100%|██████████| 94/94 [00:14<00:00,  6.51it/s]\n",
            "100%|██████████| 6/6 [00:48<00:00,  8.10s/it]\n",
            "100%|██████████| 95/95 [00:16<00:00,  5.77it/s]\n",
            "100%|██████████| 6/6 [00:31<00:00,  5.25s/it]\n",
            "100%|██████████| 97/97 [00:16<00:00,  6.05it/s]\n",
            "100%|██████████| 7/7 [01:10<00:00, 10.14s/it]\n",
            "100%|██████████| 98/98 [00:15<00:00,  6.40it/s]\n",
            "100%|██████████| 7/7 [01:11<00:00, 10.28s/it]\n",
            "100%|██████████| 99/99 [00:15<00:00,  6.46it/s]\n",
            "100%|██████████| 7/7 [00:30<00:00,  4.39s/it]\n",
            "100%|██████████| 101/101 [00:15<00:00,  6.39it/s]\n",
            "100%|██████████| 7/7 [00:36<00:00,  5.19s/it]\n",
            "100%|██████████| 104/104 [00:18<00:00,  5.75it/s]\n",
            "100%|██████████| 7/7 [01:15<00:00, 10.86s/it]\n",
            "100%|██████████| 105/105 [00:16<00:00,  6.29it/s]\n",
            "100%|██████████| 7/7 [00:34<00:00,  5.00s/it]\n",
            "100%|██████████| 105/105 [00:16<00:00,  6.25it/s]\n",
            "100%|██████████| 7/7 [00:49<00:00,  7.11s/it]\n",
            "100%|██████████| 106/106 [00:16<00:00,  6.36it/s]\n",
            "100%|██████████| 7/7 [00:27<00:00,  3.99s/it]\n",
            "100%|██████████| 106/106 [00:17<00:00,  6.11it/s]\n",
            "100%|██████████| 7/7 [01:15<00:00, 10.73s/it]\n",
            "100%|██████████| 108/108 [00:16<00:00,  6.41it/s]\n",
            "100%|██████████| 7/7 [00:31<00:00,  4.50s/it]\n",
            "100%|██████████| 108/108 [00:18<00:00,  5.98it/s]\n",
            "100%|██████████| 7/7 [01:15<00:00, 10.85s/it]\n",
            "100%|██████████| 109/109 [00:16<00:00,  6.79it/s]\n",
            "100%|██████████| 7/7 [00:57<00:00,  8.21s/it]\n",
            "100%|██████████| 110/110 [00:18<00:00,  6.07it/s]\n",
            "100%|██████████| 7/7 [00:31<00:00,  4.50s/it]\n",
            "100%|██████████| 111/111 [00:17<00:00,  6.28it/s]\n",
            "100%|██████████| 7/7 [00:51<00:00,  7.41s/it]\n",
            "100%|██████████| 112/112 [00:17<00:00,  6.49it/s]\n",
            "100%|██████████| 7/7 [01:12<00:00, 10.30s/it]\n",
            "100%|██████████| 113/113 [00:18<00:00,  6.18it/s]\n",
            "100%|██████████| 8/8 [00:42<00:00,  5.37s/it]\n",
            "100%|██████████| 121/121 [00:18<00:00,  6.72it/s]\n",
            "100%|██████████| 8/8 [00:46<00:00,  5.82s/it]\n",
            "100%|██████████| 122/122 [00:19<00:00,  6.27it/s]\n",
            "100%|██████████| 8/8 [00:32<00:00,  4.09s/it]\n",
            "100%|██████████| 122/122 [00:19<00:00,  6.24it/s]\n",
            "100%|██████████| 8/8 [01:05<00:00,  8.23s/it]\n",
            "100%|██████████| 123/123 [00:18<00:00,  6.78it/s]\n",
            "100%|██████████| 8/8 [01:06<00:00,  8.26s/it]\n",
            "100%|██████████| 126/126 [00:18<00:00,  6.84it/s]\n",
            "100%|██████████| 8/8 [00:38<00:00,  4.75s/it]\n",
            "100%|██████████| 128/128 [00:18<00:00,  6.76it/s]\n",
            "100%|██████████| 8/8 [01:13<00:00,  9.13s/it]\n",
            "100%|██████████| 131/131 [00:21<00:00,  6.13it/s]\n",
            "100%|██████████| 9/9 [01:29<00:00,  9.93s/it]\n",
            "100%|██████████| 133/133 [00:19<00:00,  6.72it/s]\n",
            "100%|██████████| 9/9 [00:38<00:00,  4.30s/it]\n",
            "100%|██████████| 134/134 [00:26<00:00,  5.08it/s]\n",
            "100%|██████████| 9/9 [01:22<00:00,  9.22s/it]\n",
            "100%|██████████| 135/135 [00:21<00:00,  6.27it/s]\n",
            "100%|██████████| 9/9 [00:40<00:00,  4.49s/it]\n",
            "100%|██████████| 138/138 [00:21<00:00,  6.30it/s]\n",
            "100%|██████████| 9/9 [00:48<00:00,  5.34s/it]\n",
            "100%|██████████| 140/140 [00:21<00:00,  6.41it/s]\n",
            "100%|██████████| 9/9 [00:36<00:00,  4.05s/it]\n",
            "100%|██████████| 151/151 [00:21<00:00,  6.95it/s]\n",
            "100%|██████████| 10/10 [00:40<00:00,  4.07s/it]\n",
            "100%|██████████| 152/152 [00:25<00:00,  6.05it/s]\n",
            "100%|██████████| 10/10 [00:46<00:00,  4.65s/it]\n",
            "100%|██████████| 153/153 [00:23<00:00,  6.63it/s]\n",
            "100%|██████████| 10/10 [00:44<00:00,  4.46s/it]\n",
            "100%|██████████| 153/153 [00:23<00:00,  6.47it/s]\n",
            "100%|██████████| 10/10 [01:42<00:00, 10.30s/it]\n",
            "100%|██████████| 156/156 [00:24<00:00,  6.28it/s]\n",
            "100%|██████████| 10/10 [01:56<00:00, 11.60s/it]\n",
            "100%|██████████| 163/163 [00:28<00:00,  5.74it/s]\n",
            "100%|██████████| 11/11 [00:49<00:00,  4.48s/it]\n",
            "100%|██████████| 169/169 [00:26<00:00,  6.28it/s]\n",
            "100%|██████████| 11/11 [00:49<00:00,  4.48s/it]\n",
            "100%|██████████| 180/180 [00:28<00:00,  6.41it/s]\n",
            "100%|██████████| 12/12 [01:09<00:00,  5.77s/it]\n",
            "100%|██████████| 185/185 [00:27<00:00,  6.75it/s]\n",
            "100%|██████████| 12/12 [02:12<00:00, 11.04s/it]\n",
            "100%|██████████| 195/195 [00:29<00:00,  6.59it/s]\n",
            "100%|██████████| 13/13 [00:50<00:00,  3.89s/it]\n",
            "100%|██████████| 201/201 [00:29<00:00,  6.92it/s]\n",
            "100%|██████████| 13/13 [01:24<00:00,  6.52s/it]\n",
            "100%|██████████| 208/208 [00:31<00:00,  6.67it/s]\n",
            "100%|██████████| 13/13 [00:35<00:00,  2.70s/it]\n",
            "100%|██████████| 210/210 [00:32<00:00,  6.51it/s]\n",
            "100%|██████████| 14/14 [02:05<00:00,  8.98s/it]\n",
            "100%|██████████| 246/246 [00:36<00:00,  6.77it/s]\n",
            "100%|██████████| 16/16 [01:29<00:00,  5.62s/it]\n",
            "100%|██████████| 500/500 [01:18<00:00,  6.38it/s]\n",
            "100%|██████████| 32/32 [06:04<00:00, 11.39s/it]\n",
            "100%|██████████| 527/527 [01:21<00:00,  6.44it/s]\n",
            "100%|██████████| 33/33 [06:25<00:00, 11.69s/it]\n",
            "100%|██████████| 1262/1262 [03:27<00:00,  6.08it/s]\n",
            "100%|██████████| 79/79 [15:59<00:00, 12.15s/it]\n",
            "  1%|          | 21/1716 [00:03<04:25,  6.38it/s]"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "warning!\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "  4%|▍         | 66/1716 [00:10<03:42,  7.41it/s]"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "warning!\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "  4%|▍         | 72/1716 [00:11<04:12,  6.52it/s]"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "warning!\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "  5%|▍         | 79/1716 [00:13<04:40,  5.83it/s]"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "warning!\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "  6%|▌         | 95/1716 [00:15<03:35,  7.51it/s]"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from ChatHaruhi.utils import float_array_to_base64\n",
        "\n",
        "# Loop-invariant settings, defined once instead of on every iteration.\n",
        "OUTPUT_DIR = \"/content/drive/MyDrive/Role_from_RoleLLM/\"\n",
        "MAX_CHUNKS = 1500  # upper bound on the number of chunks embedded per role\n",
        "embed_name = 'luotuo_openai'   # record key holding the encoded OpenAI embedding\n",
        "embed_name_bge = 'bge_en_s15'  # record key holding the encoded BGE embedding\n",
        "\n",
        "# Write one <role>.jsonl file per role: two header records (system prompt and\n",
        "# a reserved config slot) followed by one record per text chunk that carries\n",
        "# both embeddings encoded with float_array_to_base64.\n",
        "for role_name in sorted_names:\n",
        "    save_name = OUTPUT_DIR + role_name + \".jsonl\"\n",
        "    # An existing output file marks the role as done, so a rerun resumes\n",
        "    # where the previous run stopped.\n",
        "    if os.path.exists(save_name):\n",
        "        print('skip ',role_name)\n",
        "        continue\n",
        "\n",
        "    script_path = get_script_path(role_name)\n",
        "    data = read_jsonl(script_path)\n",
        "    # Slicing is safe on short lists, so no separate length guard is needed.\n",
        "    chunk_data = divide_data_into_chunk(data)[:MAX_CHUNKS]\n",
        "\n",
        "    # Each embedder receives its own copy of the chunk list.\n",
        "    embed_openai = get_embeddings_openai(chunk_data[:])\n",
        "    embed_bge = get_bge_embeddings_safe(chunk_data[:])\n",
        "\n",
        "    # zip() stops at the shortest input; refuse to write records whose text\n",
        "    # and embeddings could be truncated or paired with the wrong chunk.\n",
        "    if not (len(chunk_data) == len(embed_openai) == len(embed_bge)):\n",
        "        raise ValueError(\n",
        "            f'{role_name}: {len(chunk_data)} chunks but '\n",
        "            f'{len(embed_openai)} OpenAI and {len(embed_bge)} BGE embeddings'\n",
        "        )\n",
        "\n",
        "    system_prompt = get_system_prompt( role_name )\n",
        "\n",
        "    # Header records store a marker string in place of an embedding.\n",
        "    datas = [\n",
        "        { 'text':system_prompt , embed_name:'system_prompt', embed_name_bge:'system_prompt'},\n",
        "        { 'text':'Reserve Config Setting Here' , embed_name:'config', embed_name_bge:'config'},\n",
        "    ]\n",
        "\n",
        "    for text, embed1, embed2 in zip(chunk_data, embed_openai, embed_bge):\n",
        "        encode_vec1 = float_array_to_base64(embed1)\n",
        "        encode_vec2 = float_array_to_base64(embed2)\n",
        "        datas.append({ 'text':text , embed_name:encode_vec1, embed_name_bge:encode_vec2})\n",
        "\n",
        "    write_jsonl(datas,save_name)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "IIba46JISec_",
        "outputId": "3557db88-930e-437e-93a4-4f95172f7018"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "skip  Caesar\n",
            "skip  Sonny\n",
            "skip  Angel\n",
            "skip  Jigsaw\n",
            "skip  John Doe\n",
            "skip  Freddy Krueger\n",
            "skip  Colonel Hans Landa\n",
            "skip  Gregory House\n",
            "skip  Gaston\n",
            "skip  HAL 9000\n",
            "skip  Mark Renton\n",
            "skip  Coriolanus\n",
            "skip  Oliver Queen\n",
            "skip  Bruno Antony\n",
            "skip  D_Artagnan\n",
            "skip  Dr. Frank-N-Furter\n",
            "skip  Tugg Speedman\n",
            "skip  Stifler\n",
            "skip  Jeff Spicoli\n",
            "skip  Rorschach\n",
            "skip  Paul Vitti\n",
            "skip  Logan\n",
            "skip  Judge Dredd\n",
            "skip  Karl Childers\n",
            "skip  Rachel Lang\n",
            "skip  Queen Elizabeth I\n",
            "skip  Tyrion Lannister\n",
            "skip  John Keating\n",
            "skip  Wade Wilson\n",
            "skip  Lyn Cassady\n",
            "skip  Dr. Hannibal Lecter\n",
            "skip  Violet Weston\n",
            "skip  Po\n",
            "skip  Malcolm X\n",
            "skip  Willie Soke\n",
            "skip  Jack Torrance\n",
            "skip  Alvy Singer\n",
            "skip  Colonel Nathan R. Jessep\n",
            "skip  Andrew Detmer\n",
            "skip  Fred Flintstone\n",
            "skip  Frank T.J. Mackey\n",
            "skip  Stephen Hawking\n",
            "skip  Lestat de Lioncourt\n",
            "skip  Jack Sparrow\n",
            "skip  John Coffey\n",
            "skip  Murphy MacManus\n",
            "skip  John Dillinger\n",
            "skip  Jackie Moon\n",
            "skip  Peter Parker\n",
            "skip  Abraham Lincoln\n",
            "skip  James Carter\n",
            "skip  Tyler Hawkins\n",
            "skip  Stanley Ipkiss\n",
            "skip  Mater\n",
            "skip  Professor G.H. Dorr\n",
            "skip  Juno MacGuff\n",
            "skip  Seth\n",
            "skip  Sherlock Holmes\n",
            "skip  Truman Capote\n",
            "skip  Shrek\n",
            "skip  Travis Bickle\n",
            "skip  Jack\n",
            "skip  Tom Ripley\n",
            "skip  The Dude\n",
            "skip  David Aames\n",
            "skip  Twilight Sparkle\n",
            "skip  Antonio Salieri\n",
            "skip  Judy Hoops\n",
            "skip  Randle McMurphy\n",
            "skip  Thor\n",
            "skip  Walt Kowalski\n",
            "skip  Fletcher Reede\n",
            "skip  Theodore Twombly\n",
            "skip  James Brown\n",
            "skip  Paul Conroy\n",
            "skip  James Bond\n",
            "skip  Queen Catherine\n",
            "skip  Harvey Milk\n",
            "skip  Caden Cotard\n",
            "skip  Leonard Shelby\n",
            "skip  Jim Morrison\n",
            "skip  Pat Solitano\n",
            "skip  Benjamin Button\n",
            "skip  Robert Angier\n",
            "skip  Lucifer Morningstar\n",
            "skip  Jordan Belfort\n",
            "skip  Coach Eric Taylor\n",
            "skip  Mary Sibley\n",
            "skip  Klaus Mikaelson\n",
            "skip  Raylan Givens\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 1500/1500 [03:51<00:00,  6.49it/s]\n",
            "100%|██████████| 47/47 [17:19<00:00, 22.11s/it]\n",
            "100%|██████████| 1500/1500 [03:44<00:00,  6.67it/s]\n",
            "100%|██████████| 47/47 [16:28<00:00, 21.03s/it]\n",
            "100%|██████████| 1500/1500 [03:54<00:00,  6.40it/s]\n",
            "100%|██████████| 47/47 [17:25<00:00, 22.24s/it]\n",
            "100%|██████████| 1500/1500 [03:47<00:00,  6.59it/s]\n",
            "100%|██████████| 47/47 [18:35<00:00, 23.74s/it]\n",
            "100%|██████████| 1500/1500 [04:05<00:00,  6.10it/s]\n",
            "100%|██████████| 47/47 [18:46<00:00, 23.97s/it]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Repeats the record-building and writing steps of the per-role loop at top\n",
        "# level, for a single role.\n",
        "# NOTE(review): relies on role_name, chunk_data, embed_openai, embed_bge and\n",
        "# save_name already being defined in the kernel by an earlier cell.\n",
        "\n",
        "# Uncomment to recompute the BGE embeddings first:\n",
        "# embed_bge = get_bge_embeddings_safe(chunk_data[:])\n",
        "\n",
        "from ChatHaruhi.utils import float_array_to_base64\n",
        "\n",
        "embed_name = 'luotuo_openai'\n",
        "embed_name_bge = 'bge_en_s15'\n",
        "\n",
        "system_prompt = get_system_prompt( role_name )\n",
        "\n",
        "# Two header records come first; they carry marker strings instead of vectors.\n",
        "datas = [\n",
        "    { 'text':system_prompt , embed_name:'system_prompt', embed_name_bge:'system_prompt'},\n",
        "    { 'text':'Reserve Config Setting Here' , embed_name:'config', embed_name_bge:'config'},\n",
        "]\n",
        "\n",
        "# One record per chunk, with both embeddings encoded by float_array_to_base64.\n",
        "datas.extend(\n",
        "    { 'text':text , embed_name:float_array_to_base64(embed1), embed_name_bge:float_array_to_base64(embed2)}\n",
        "    for text, embed1, embed2 in zip(chunk_data, embed_openai, embed_bge)\n",
        ")\n",
        "\n",
        "write_jsonl(datas,save_name)"
      ],
      "metadata": {
        "id": "Dp4uL56gvfZL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "6GJrxKVFwxrH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 生成 GitHub 的描述"
      ],
      "metadata": {
        "id": "zuro2xvbP66G"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import json\n",
        "\n",
        "desc_path = \"/content/RoleBench/profiles-eng/desc.json\"\n",
        "movie_name_path = \"/content/RoleBench/profiles-eng/scripts.json\"\n",
        "\n",
        "\n",
        "def _load_json(path):\n",
        "    \"\"\"Return the parsed content of the JSON file at `path`.\n",
        "\n",
        "    The file is opened as UTF-8 explicitly so the result does not depend on\n",
        "    the platform's default text encoding.\n",
        "    \"\"\"\n",
        "    with open(path, encoding=\"utf-8\") as f:\n",
        "        return json.load(f)\n",
        "\n",
        "\n",
        "# Parse the two JSON files above. movie_names is iterated by the following\n",
        "# cells as a mapping from role name to the title printed in the table.\n",
        "desc = _load_json(desc_path)\n",
        "movie_names = _load_json(movie_name_path)\n"
      ],
      "metadata": {
        "id": "GW7DR3YnP4fZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Print a prompt asking for a Chinese description of every role, followed by\n",
        "# a markdown table whose third column is left empty to be filled in.\n",
        "prompt_header = '''为下面的markdown表格的第三列，增加这个角色的中文描述。\n",
        "\n",
        "\n",
        "例子输入:\n",
        "Jack Sparrow  |  Pirates-of-the-Caribbean-Dead-Man's-Chest  |\n",
        "\n",
        "例子输出:\n",
        "Jack Sparrow  |  Pirates-of-the-Caribbean-Dead-Man's-Chest  | 加勒比海盗中的杰克船长\n",
        "\n",
        "输入:\n",
        "'''\n",
        "print(prompt_header)\n",
        "\n",
        "# Table header row and markdown separator row.\n",
        "print('角色 | 电影 | 中文', '---|---|---', sep='\\n')\n",
        "\n",
        "# One row per role: name, title, and a blank third column.\n",
        "for role_name, movie in movie_names.items():\n",
        "    print(f'{role_name}  |  {movie}  | ')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VWkYdVlYP6Ic",
        "outputId": "fa22e85f-7eae-464f-c468-77a1e9343acb"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "为下面的markdown表格的第三列，增加这个角色的中文描述。\n",
            "\n",
            "\n",
            "例子输入:\n",
            "Jack Sparrow  |  Pirates-of-the-Caribbean-Dead-Man's-Chest  | \n",
            "\n",
            "例子输出:\n",
            "Jack Sparrow  |  Pirates-of-the-Caribbean-Dead-Man's-Chest  | 加勒比海盗中的杰克船长\n",
            "\n",
            "输入:\n",
            "\n",
            "角色 | 电影 | 中文\n",
            "---|---|---\n",
            "HAL 9000  |  2001-A-Space-Odyssey  | \n",
            "Colonel Nathan R. Jessep  |  A-Few-Good-Men  | \n",
            "Antonio Salieri  |  Amadeus  | \n",
            "Stifler  |  American-Pie  | \n",
            "Paul Vitti  |  Analyze-That  | \n",
            "Alvy Singer  |  Annie-Hall  | \n",
            "Violet Weston  |  August-Osage-County  | \n",
            "Willie Soke  |  Bad-Santa  | \n",
            "Gaston  |  Beauty-and-the-Beast  | \n",
            "The Dude  |  Big-Lebowski,-The  | \n",
            "Murphy MacManus  |  Boondock-Saints,-The  | \n",
            "Paul Conroy  |  Buried  | \n",
            "Truman Capote  |  Capote  | \n",
            "Mater  |  Cars-2  | \n",
            "Andrew Detmer  |  Chronicle  | \n",
            "Coriolanus  |  Coriolanus  | \n",
            "Benjamin Button  |  Curious-Case-of-Benjamin-Button,-The  | \n",
            "John Keating  |  Dead-Poets-Society  | \n",
            "Wade Wilson  |  Deadpool  | \n",
            "Jim Morrison  |  Doors,-The  | \n",
            "Queen Elizabeth I  |  Elizabeth-The-Golden-Age  | \n",
            "Jeff Spicoli  |  Fast-Times-at-Ridgemont-High  | \n",
            "Fred Flintstone  |  Flintstones,-The  | \n",
            "Freddy Krueger  |  Freddy-vs.-Jason  | \n",
            "Tyrion Lannister  |  Game_of_Thrones  | \n",
            "James Brown  |  Get-on-Up  | \n",
            "Walt Kowalski  |  Gran-Torino  | \n",
            "John Coffey  |  Green-Mile,-The  | \n",
            "Theodore Twombly  |  Her  | \n",
            "Gregory House  |  House-M.D.  | \n",
            "Sonny  |  I,-Robot  | \n",
            "Colonel Hans Landa  |  Inglourious-Basterds  | \n",
            "Judge Dredd  |  Judge-Dredd  | \n",
            "Juno MacGuff  |  Juno  | \n",
            "Po  |  Kung-Fu-Panda  | \n",
            "Professor G.H. Dorr  |  Ladykillers,-The  | \n",
            "Fletcher Reede  |  Liar-Liar  | \n",
            "Abraham Lincoln  |  Lincoln  | \n",
            "Frank T.J. Mackey  |  Magnolia  | \n",
            "Malcolm X  |  Malcolm-X  | \n",
            "Leonard Shelby  |  Memento  | \n",
            "Harvey Milk  |  Milk  | \n",
            "Randle McMurphy  |  One-Flew-Over-the-Cuckoo's-Nest  | \n",
            "Jack Sparrow  |  Pirates-of-the-Caribbean-Dead-Man's-Chest  | \n",
            "John Dillinger  |  Public-Enemies  | \n",
            "Lestat de Lioncourt  |  Queen-of-the-Damned  | \n",
            "Tyler Hawkins  |  Remember-Me  | \n",
            "Caesar  |  Rise-of-the-Planet-of-the-Apes  | \n",
            "Jack  |  Room  | \n",
            "James Carter  |  Rush-Hour-2  | \n",
            "Jigsaw  |  Saw  | \n",
            "John Doe  |  Se7en  | \n",
            "Jackie Moon  |  Semi-Pro  | \n",
            "Sherlock Holmes  |  Sherlock-Holmes  | \n",
            "Shrek  |  Shrek  | \n",
            "Pat Solitano  |  Silver-Linings-Playbook  | \n",
            "Karl Childers  |  Sling-Blade  | \n",
            "Peter Parker  |  Spider-Man  | \n",
            "Bruno Antony  |  Strangers-on-a-Train  | \n",
            "Seth  |  Superbad  | \n",
            "Caden Cotard  |  Synecdoche,-New-York  | \n",
            "Travis Bickle  |  Taxi-Driver  | \n",
            "Stanley Ipkiss  |  Mask,-The  | \n",
            "Lyn Cassady  |  Men-Who-Stare-at-Goats,-The  | \n",
            "Michael Scott  |  The_Office  | \n",
            "Robert Angier  |  Prestige,-The  | \n",
            "Rachel Lang  |  The-Rage-Carrie-2  | \n",
            "Dr. Frank-N-Furter  |  Rocky-Horror-Picture-Show,-The  | \n",
            "Jack Torrance  |  Shining,-The  | \n",
            "Tom Ripley  |  Talented-Mr.-Ripley,-The  | \n",
            "D_Artagnan  |  Three-Musketeers,-The  | \n",
            "Stephen Hawking  |  Theory-of-Everything,-The  | \n",
            "Thor  |  Thor-Ragnarok  | \n",
            "James Bond  |  Tomorrow-Never-Dies  | \n",
            "Mark Renton  |  Trainspotting  | \n",
            "Tugg Speedman  |  Tropic-Thunder  | \n",
            "David Aames  |  Vanilla-Sky  | \n",
            "Rorschach  |  Watchmen  | \n",
            "Jordan Belfort  |  Wolf-of-Wall-Street,-The  | \n",
            "Logan  |  X-Men-Origins-Wolverine  | \n",
            "Judy Hoops  |  Zootopia  | \n",
            "Doctor Who  |  Doctor_Who  | \n",
            "Blair Waldorf  |  Gossip_Girl  | \n",
            "Raylan Givens  |  Justified  | \n",
            "Mary Sibley  |  Salem  | \n",
            "Lucifer Morningstar  |  Lucifer  | \n",
            "Sheldon Cooper  |  The_Big_Bang_Theory  | \n",
            "Twilight Sparkle  |  My_Little_Pony__Friendship_is_Magic  | \n",
            "Oliver Queen  |  Arrow  | \n",
            "Leroy Jethro Gibbs  |  NCIS  | \n",
            "Angel  |  Angel  | \n",
            "Klaus Mikaelson  |  The_Originals  | \n",
            "Queen Catherine  |  Reign  | \n",
            "Dr. Hannibal Lecter  |  Hannibal  | \n",
            "Coach Eric Taylor  |  Friday_Night_Lights  | \n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Chinese one-line description of each role, one description per line.\n",
        "# The table-printing cell pairs these with roles purely by position, so the\n",
        "# line order must match the iteration order of movie_names.\n",
        "# NOTE(review): presumably pasted from a model's answer to the prompt printed\n",
        "# by the previous cell; some film titles may not match the official Chinese\n",
        "# release titles - verify before publishing.\n",
        "ch_desc = \"\"\"《2001太空漫游》中的HAL 9000电脑\n",
        "《好汉两三个》中的内森·R·杰瑟普上校\n",
        "《阿玛迪斯》中的安东尼奥·萨列里\n",
        "《美国派》中的斯蒂夫勒\n",
        "《心理分析那件事》中的保罗·维蒂\n",
        "《安妮·霍尔》中的阿尔维·辛格\n",
        "《奥塞奇郡的八月》中的紫罗兰·韦斯顿\n",
        "《坏圣诞老人》中的威利·索克\n",
        "《美女与野兽》中的加斯顿\n",
        "《大勒布斯基》中的“大佬”\n",
        "《天使之城》中的墨菲·麦克马纳斯\n",
        "《活埋》中的保罗·康罗伊\n",
        "《卡波特》中的杜鲁门·卡波特\n",
        "《赛车总动员2》中的玛特\n",
        "《编年史》中的安德鲁·德特默\n",
        "《科里奥兰纳斯》中的主角\n",
        "《本杰明·巴顿奇事》中的本杰明·巴顿\n",
        "《死亡诗社》中的约翰·基廷\n",
        "《死侍》中的韦德·威尔逊\n",
        "《门》中的吉姆·莫里森\n",
        "《伊丽莎白：黄金时代》中的伊丽莎白一世女王\n",
        "《瑞奇蒙特高中时光》中的杰夫·斯皮科利\n",
        "《石头家族》中的弗雷德·弗林斯通\n",
        "《弗雷迪对杰森》中的弗雷迪·克鲁格\n",
        "《权力的游戏》中的提利昂·兰尼斯特\n",
        "《起身舞蹈》中的詹姆斯·布朗\n",
        "《老无所依》中的沃尔特·科瓦尔斯基\n",
        "《绿里奇迹》中的约翰·科菲\n",
        "《她》中的西奥多·特温布利\n",
        "《豪斯医生》中的格雷戈里·豪斯\n",
        "《我，机器人》中的桑尼\n",
        "《无耻混蛋》中的汉斯·兰达上校\n",
        "《德莱德法官》中的法官德莱德\n",
        "《朱诺》中的朱诺·麦克夫\n",
        "《功夫熊猫》中的阿宝\n",
        "《夫人杀手》中的G.H.多尔教授\n",
        "《撒谎的男人》中的弗莱彻·里德\n",
        "《林肯》中的亚伯拉罕·林肯\n",
        "《木兰花》中的弗兰克 T.J. 麦凯\n",
        "《马尔科姆X》中的马尔科姆X\n",
        "《记忆碎片》中的伦纳德·谢尔比\n",
        "《牛奶》中的哈维·牛奶\n",
        "《飞越疯人院》中的兰德尔·麦克默菲\n",
        "《加勒比海盗》中的杰克·斯派洛船长\n",
        "《公敌》中的约翰·迪林格\n",
        "《诅咒女王》中的莱斯塔特·德·莱昂科特\n",
        "《记得我》中的泰勒·霍金斯\n",
        "《猩球崛起》中的凯撒\n",
        "《房间》中的杰克\n",
        "《尖峰时刻2》中的詹姆斯·卡特\n",
        "《电锯惊魂》中的拼图杀手\n",
        "《七宗罪》中的约翰·多\n",
        "《半职业球员》中的杰基·月亮\n",
        "《夏洛克·福尔摩斯》中的夏洛克·福尔摩斯\n",
        "《史莱克》中的史莱克\n",
        "《乌云背后的幸福线》中的帕特·索利塔诺\n",
        "《刀锯》中的卡尔·柴尔德斯\n",
        "《蜘蛛侠》中的彼得·帕克\n",
        "《列车上的陌生人》中的布鲁诺·安东尼\n",
        "《超级糟糕》中的塞思\n",
        "《纽约奇缘》中的卡登·科塔德\n",
        "《出租车司机》中的特拉维斯·比克尔\n",
        "《面具》中的斯坦利·伊普基斯\n",
        "《盯羊的男人》中的林恩·卡萨迪\n",
        "《办公室》中的迈克尔·斯科特\n",
        "《名望》中的罗伯特·安吉尔\n",
        "《瑞秋的愤怒：凯丽2》中的瑞秋·朗\n",
        "《洛奇恐怖秀》中的弗兰克·N·福特医生\n",
        "《闪灵》中的杰克·托兰斯\n",
        "《天才雷普利》中的汤姆·雷普利\n",
        "《三剑客》中的达达尼昂\n",
        "《万物理论》中的斯蒂芬·霍金\n",
        "《雷神：诸神黄昏》中的雷神索尔\n",
        "《明日帝国》中的詹姆斯·邦德\n",
        "《迷幻列车》中的马克·伦顿\n",
        "《热带惊雷》中的塔格·斯皮德曼\n",
        "《香草天空》中的大卫·艾姆斯\n",
        "《守望者》中的罗夏克\n",
        "《华尔街之狼》中的乔丹·贝尔福特\n",
        "《X战警：金刚狼》中的洛根\n",
        "《疯狂动物城》中的朱迪·胡普斯\n",
        "《神秘博士》中的博士\n",
        "《绯闻女孩》中的布莱尔·沃尔多夫\n",
        "《正当防卫》中的雷兰·吉文斯\n",
        "《塞勒姆》中的玛丽·西布利\n",
        "《路西法》中的路西法·晨星\n",
        "《生活大爆炸》中的谢尔顿·库珀\n",
        "《我的小马驹：友谊之魔》中的暮光星辉\n",
        "《绿箭侠》中的奥利弗·皇后\n",
        "《海军罪案调查处》中的利洛伊·杰斯罗·吉布斯\n",
        "《天使》中的天使\n",
        "《始祖家族》中的克劳斯·米卡尔森\n",
        "《王权》中的凯瑟琳女王\n",
        "《汉尼拔》中的汉尼拔·莱克特医生\n",
        "《星期五之光》中的教练埃里克·泰勒\"\"\"\n",
        "\n",
        "# Split the block into a list holding one description per entry.\n",
        "ch_descs = ch_desc.split('\\n')\n"
      ],
      "metadata": {
        "id": "-WK73B45Str3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Print the final markdown table: role | title | Chinese description | field.\n",
        "# Descriptions are matched to roles purely by position, so both collections\n",
        "# must have the same length; otherwise rows would be left out or the loop\n",
        "# would stop part-way through the table.\n",
        "if len(ch_descs) != len(movie_names):\n",
        "    raise ValueError(\n",
        "        f'expected {len(movie_names)} descriptions, got {len(ch_descs)}'\n",
        "    )\n",
        "\n",
        "print('角色 | 电影 | 中文 | 字段 ')\n",
        "\n",
        "print('---|---|---|---')\n",
        "\n",
        "for (role_name, movie), zh_desc in zip(movie_names.items(), ch_descs):\n",
        "    # Last column: the per-role identifier under silk-road/ChatHaruhi-from-RoleLLM.\n",
        "    field = \"silk-road/ChatHaruhi-from-RoleLLM/\" + role_name\n",
        "    print(f'{role_name}  |  {movie}  |  {zh_desc} | {field}')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0R8wdigkQfIj",
        "outputId": "24839fa7-1569-4b7f-aaf0-e251c9fbcb18"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "角色 | 电影 | 中文 | 字段 \n",
            "---|---|---|---\n",
            "HAL 9000  |  2001-A-Space-Odyssey  |  《2001太空漫游》中的HAL 9000电脑 | silk-road/ChatHaruhi-from-RoleLLM/HAL 9000\n",
            "Colonel Nathan R. Jessep  |  A-Few-Good-Men  |  《好汉两三个》中的内森·R·杰瑟普上校 | silk-road/ChatHaruhi-from-RoleLLM/Colonel Nathan R. Jessep\n",
            "Antonio Salieri  |  Amadeus  |  《阿玛迪斯》中的安东尼奥·萨列里 | silk-road/ChatHaruhi-from-RoleLLM/Antonio Salieri\n",
            "Stifler  |  American-Pie  |  《美国派》中的斯蒂夫勒 | silk-road/ChatHaruhi-from-RoleLLM/Stifler\n",
            "Paul Vitti  |  Analyze-That  |  《心理分析那件事》中的保罗·维蒂 | silk-road/ChatHaruhi-from-RoleLLM/Paul Vitti\n",
            "Alvy Singer  |  Annie-Hall  |  《安妮·霍尔》中的阿尔维·辛格 | silk-road/ChatHaruhi-from-RoleLLM/Alvy Singer\n",
            "Violet Weston  |  August-Osage-County  |  《奥塞奇郡的八月》中的紫罗兰·韦斯顿 | silk-road/ChatHaruhi-from-RoleLLM/Violet Weston\n",
            "Willie Soke  |  Bad-Santa  |  《坏圣诞老人》中的威利·索克 | silk-road/ChatHaruhi-from-RoleLLM/Willie Soke\n",
            "Gaston  |  Beauty-and-the-Beast  |  《美女与野兽》中的加斯顿 | silk-road/ChatHaruhi-from-RoleLLM/Gaston\n",
            "The Dude  |  Big-Lebowski,-The  |  《大勒布斯基》中的“大佬” | silk-road/ChatHaruhi-from-RoleLLM/The Dude\n",
            "Murphy MacManus  |  Boondock-Saints,-The  |  《天使之城》中的墨菲·麦克马纳斯 | silk-road/ChatHaruhi-from-RoleLLM/Murphy MacManus\n",
            "Paul Conroy  |  Buried  |  《活埋》中的保罗·康罗伊 | silk-road/ChatHaruhi-from-RoleLLM/Paul Conroy\n",
            "Truman Capote  |  Capote  |  《卡波特》中的杜鲁门·卡波特 | silk-road/ChatHaruhi-from-RoleLLM/Truman Capote\n",
            "Mater  |  Cars-2  |  《赛车总动员2》中的玛特 | silk-road/ChatHaruhi-from-RoleLLM/Mater\n",
            "Andrew Detmer  |  Chronicle  |  《编年史》中的安德鲁·德特默 | silk-road/ChatHaruhi-from-RoleLLM/Andrew Detmer\n",
            "Coriolanus  |  Coriolanus  |  《科里奥兰纳斯》中的主角 | silk-road/ChatHaruhi-from-RoleLLM/Coriolanus\n",
            "Benjamin Button  |  Curious-Case-of-Benjamin-Button,-The  |  《本杰明·巴顿奇事》中的本杰明·巴顿 | silk-road/ChatHaruhi-from-RoleLLM/Benjamin Button\n",
            "John Keating  |  Dead-Poets-Society  |  《死亡诗社》中的约翰·基廷 | silk-road/ChatHaruhi-from-RoleLLM/John Keating\n",
            "Wade Wilson  |  Deadpool  |  《死侍》中的韦德·威尔逊 | silk-road/ChatHaruhi-from-RoleLLM/Wade Wilson\n",
            "Jim Morrison  |  Doors,-The  |  《门》中的吉姆·莫里森 | silk-road/ChatHaruhi-from-RoleLLM/Jim Morrison\n",
            "Queen Elizabeth I  |  Elizabeth-The-Golden-Age  |  《伊丽莎白：黄金时代》中的伊丽莎白一世女王 | silk-road/ChatHaruhi-from-RoleLLM/Queen Elizabeth I\n",
            "Jeff Spicoli  |  Fast-Times-at-Ridgemont-High  |  《瑞奇蒙特高中时光》中的杰夫·斯皮科利 | silk-road/ChatHaruhi-from-RoleLLM/Jeff Spicoli\n",
            "Fred Flintstone  |  Flintstones,-The  |  《石头家族》中的弗雷德·弗林斯通 | silk-road/ChatHaruhi-from-RoleLLM/Fred Flintstone\n",
            "Freddy Krueger  |  Freddy-vs.-Jason  |  《弗雷迪对杰森》中的弗雷迪·克鲁格 | silk-road/ChatHaruhi-from-RoleLLM/Freddy Krueger\n",
            "Tyrion Lannister  |  Game_of_Thrones  |  《权力的游戏》中的提利昂·兰尼斯特 | silk-road/ChatHaruhi-from-RoleLLM/Tyrion Lannister\n",
            "James Brown  |  Get-on-Up  |  《起身舞蹈》中的詹姆斯·布朗 | silk-road/ChatHaruhi-from-RoleLLM/James Brown\n",
            "Walt Kowalski  |  Gran-Torino  |  《老无所依》中的沃尔特·科瓦尔斯基 | silk-road/ChatHaruhi-from-RoleLLM/Walt Kowalski\n",
            "John Coffey  |  Green-Mile,-The  |  《绿里奇迹》中的约翰·科菲 | silk-road/ChatHaruhi-from-RoleLLM/John Coffey\n",
            "Theodore Twombly  |  Her  |  《她》中的西奥多·特温布利 | silk-road/ChatHaruhi-from-RoleLLM/Theodore Twombly\n",
            "Gregory House  |  House-M.D.  |  《豪斯医生》中的格雷戈里·豪斯 | silk-road/ChatHaruhi-from-RoleLLM/Gregory House\n",
            "Sonny  |  I,-Robot  |  《我，机器人》中的桑尼 | silk-road/ChatHaruhi-from-RoleLLM/Sonny\n",
            "Colonel Hans Landa  |  Inglourious-Basterds  |  《无耻混蛋》中的汉斯·兰达上校 | silk-road/ChatHaruhi-from-RoleLLM/Colonel Hans Landa\n",
            "Judge Dredd  |  Judge-Dredd  |  《德莱德法官》中的法官德莱德 | silk-road/ChatHaruhi-from-RoleLLM/Judge Dredd\n",
            "Juno MacGuff  |  Juno  |  《朱诺》中的朱诺·麦克夫 | silk-road/ChatHaruhi-from-RoleLLM/Juno MacGuff\n",
            "Po  |  Kung-Fu-Panda  |  《功夫熊猫》中的阿宝 | silk-road/ChatHaruhi-from-RoleLLM/Po\n",
            "Professor G.H. Dorr  |  Ladykillers,-The  |  《夫人杀手》中的G.H.多尔教授 | silk-road/ChatHaruhi-from-RoleLLM/Professor G.H. Dorr\n",
            "Fletcher Reede  |  Liar-Liar  |  《撒谎的男人》中的弗莱彻·里德 | silk-road/ChatHaruhi-from-RoleLLM/Fletcher Reede\n",
            "Abraham Lincoln  |  Lincoln  |  《林肯》中的亚伯拉罕·林肯 | silk-road/ChatHaruhi-from-RoleLLM/Abraham Lincoln\n",
            "Frank T.J. Mackey  |  Magnolia  |  《木兰花》中的弗兰克 T.J. 麦凯 | silk-road/ChatHaruhi-from-RoleLLM/Frank T.J. Mackey\n",
            "Malcolm X  |  Malcolm-X  |  《马尔科姆X》中的马尔科姆X | silk-road/ChatHaruhi-from-RoleLLM/Malcolm X\n",
            "Leonard Shelby  |  Memento  |  《记忆碎片》中的伦纳德·谢尔比 | silk-road/ChatHaruhi-from-RoleLLM/Leonard Shelby\n",
            "Harvey Milk  |  Milk  |  《牛奶》中的哈维·牛奶 | silk-road/ChatHaruhi-from-RoleLLM/Harvey Milk\n",
            "Randle McMurphy  |  One-Flew-Over-the-Cuckoo's-Nest  |  《飞越疯人院》中的兰德尔·麦克默菲 | silk-road/ChatHaruhi-from-RoleLLM/Randle McMurphy\n",
            "Jack Sparrow  |  Pirates-of-the-Caribbean-Dead-Man's-Chest  |  《加勒比海盗》中的杰克·斯派洛船长 | silk-road/ChatHaruhi-from-RoleLLM/Jack Sparrow\n",
            "John Dillinger  |  Public-Enemies  |  《公敌》中的约翰·迪林格 | silk-road/ChatHaruhi-from-RoleLLM/John Dillinger\n",
            "Lestat de Lioncourt  |  Queen-of-the-Damned  |  《诅咒女王》中的莱斯塔特·德·莱昂科特 | silk-road/ChatHaruhi-from-RoleLLM/Lestat de Lioncourt\n",
            "Tyler Hawkins  |  Remember-Me  |  《记得我》中的泰勒·霍金斯 | silk-road/ChatHaruhi-from-RoleLLM/Tyler Hawkins\n",
            "Caesar  |  Rise-of-the-Planet-of-the-Apes  |  《猩球崛起》中的凯撒 | silk-road/ChatHaruhi-from-RoleLLM/Caesar\n",
            "Jack  |  Room  |  《房间》中的杰克 | silk-road/ChatHaruhi-from-RoleLLM/Jack\n",
            "James Carter  |  Rush-Hour-2  |  《尖峰时刻2》中的詹姆斯·卡特 | silk-road/ChatHaruhi-from-RoleLLM/James Carter\n",
            "Jigsaw  |  Saw  |  《电锯惊魂》中的拼图杀手 | silk-road/ChatHaruhi-from-RoleLLM/Jigsaw\n",
            "John Doe  |  Se7en  |  《七宗罪》中的约翰·多 | silk-road/ChatHaruhi-from-RoleLLM/John Doe\n",
            "Jackie Moon  |  Semi-Pro  |  《半职业球员》中的杰基·月亮 | silk-road/ChatHaruhi-from-RoleLLM/Jackie Moon\n",
            "Sherlock Holmes  |  Sherlock-Holmes  |  《夏洛克·福尔摩斯》中的夏洛克·福尔摩斯 | silk-road/ChatHaruhi-from-RoleLLM/Sherlock Holmes\n",
            "Shrek  |  Shrek  |  《史莱克》中的史莱克 | silk-road/ChatHaruhi-from-RoleLLM/Shrek\n",
            "Pat Solitano  |  Silver-Linings-Playbook  |  《乌云背后的幸福线》中的帕特·索利塔诺 | silk-road/ChatHaruhi-from-RoleLLM/Pat Solitano\n",
            "Karl Childers  |  Sling-Blade  |  《刀锯》中的卡尔·柴尔德斯 | silk-road/ChatHaruhi-from-RoleLLM/Karl Childers\n",
            "Peter Parker  |  Spider-Man  |  《蜘蛛侠》中的彼得·帕克 | silk-road/ChatHaruhi-from-RoleLLM/Peter Parker\n",
            "Bruno Antony  |  Strangers-on-a-Train  |  《列车上的陌生人》中的布鲁诺·安东尼 | silk-road/ChatHaruhi-from-RoleLLM/Bruno Antony\n",
            "Seth  |  Superbad  |  《超级糟糕》中的塞思 | silk-road/ChatHaruhi-from-RoleLLM/Seth\n",
            "Caden Cotard  |  Synecdoche,-New-York  |  《纽约奇缘》中的卡登·科塔德 | silk-road/ChatHaruhi-from-RoleLLM/Caden Cotard\n",
            "Travis Bickle  |  Taxi-Driver  |  《出租车司机》中的特拉维斯·比克尔 | silk-road/ChatHaruhi-from-RoleLLM/Travis Bickle\n",
            "Stanley Ipkiss  |  Mask,-The  |  《面具》中的斯坦利·伊普基斯 | silk-road/ChatHaruhi-from-RoleLLM/Stanley Ipkiss\n",
            "Lyn Cassady  |  Men-Who-Stare-at-Goats,-The  |  《盯羊的男人》中的林恩·卡萨迪 | silk-road/ChatHaruhi-from-RoleLLM/Lyn Cassady\n",
            "Michael Scott  |  The_Office  |  《办公室》中的迈克尔·斯科特 | silk-road/ChatHaruhi-from-RoleLLM/Michael Scott\n",
            "Robert Angier  |  Prestige,-The  |  《名望》中的罗伯特·安吉尔 | silk-road/ChatHaruhi-from-RoleLLM/Robert Angier\n",
            "Rachel Lang  |  The-Rage-Carrie-2  |  《瑞秋的愤怒：凯丽2》中的瑞秋·朗 | silk-road/ChatHaruhi-from-RoleLLM/Rachel Lang\n",
            "Dr. Frank-N-Furter  |  Rocky-Horror-Picture-Show,-The  |  《洛奇恐怖秀》中的弗兰克·N·福特医生 | silk-road/ChatHaruhi-from-RoleLLM/Dr. Frank-N-Furter\n",
            "Jack Torrance  |  Shining,-The  |  《闪灵》中的杰克·托兰斯 | silk-road/ChatHaruhi-from-RoleLLM/Jack Torrance\n",
            "Tom Ripley  |  Talented-Mr.-Ripley,-The  |  《天才雷普利》中的汤姆·雷普利 | silk-road/ChatHaruhi-from-RoleLLM/Tom Ripley\n",
            "D_Artagnan  |  Three-Musketeers,-The  |  《三剑客》中的达达尼昂 | silk-road/ChatHaruhi-from-RoleLLM/D_Artagnan\n",
            "Stephen Hawking  |  Theory-of-Everything,-The  |  《万物理论》中的斯蒂芬·霍金 | silk-road/ChatHaruhi-from-RoleLLM/Stephen Hawking\n",
            "Thor  |  Thor-Ragnarok  |  《雷神：诸神黄昏》中的雷神索尔 | silk-road/ChatHaruhi-from-RoleLLM/Thor\n",
            "James Bond  |  Tomorrow-Never-Dies  |  《明日帝国》中的詹姆斯·邦德 | silk-road/ChatHaruhi-from-RoleLLM/James Bond\n",
            "Mark Renton  |  Trainspotting  |  《迷幻列车》中的马克·伦顿 | silk-road/ChatHaruhi-from-RoleLLM/Mark Renton\n",
            "Tugg Speedman  |  Tropic-Thunder  |  《热带惊雷》中的塔格·斯皮德曼 | silk-road/ChatHaruhi-from-RoleLLM/Tugg Speedman\n",
            "David Aames  |  Vanilla-Sky  |  《香草天空》中的大卫·艾姆斯 | silk-road/ChatHaruhi-from-RoleLLM/David Aames\n",
            "Rorschach  |  Watchmen  |  《守望者》中的罗夏克 | silk-road/ChatHaruhi-from-RoleLLM/Rorschach\n",
            "Jordan Belfort  |  Wolf-of-Wall-Street,-The  |  《华尔街之狼》中的乔丹·贝尔福特 | silk-road/ChatHaruhi-from-RoleLLM/Jordan Belfort\n",
            "Logan  |  X-Men-Origins-Wolverine  |  《X战警：金刚狼》中的洛根 | silk-road/ChatHaruhi-from-RoleLLM/Logan\n",
            "Judy Hoops  |  Zootopia  |  《疯狂动物城》中的朱迪·胡普斯 | silk-road/ChatHaruhi-from-RoleLLM/Judy Hoops\n",
            "Doctor Who  |  Doctor_Who  |  《神秘博士》中的博士 | silk-road/ChatHaruhi-from-RoleLLM/Doctor Who\n",
            "Blair Waldorf  |  Gossip_Girl  |  《绯闻女孩》中的布莱尔·沃尔多夫 | silk-road/ChatHaruhi-from-RoleLLM/Blair Waldorf\n",
            "Raylan Givens  |  Justified  |  《正当防卫》中的雷兰·吉文斯 | silk-road/ChatHaruhi-from-RoleLLM/Raylan Givens\n",
            "Mary Sibley  |  Salem  |  《塞勒姆》中的玛丽·西布利 | silk-road/ChatHaruhi-from-RoleLLM/Mary Sibley\n",
            "Lucifer Morningstar  |  Lucifer  |  《路西法》中的路西法·晨星 | silk-road/ChatHaruhi-from-RoleLLM/Lucifer Morningstar\n",
            "Sheldon Cooper  |  The_Big_Bang_Theory  |  《生活大爆炸》中的谢尔顿·库珀 | silk-road/ChatHaruhi-from-RoleLLM/Sheldon Cooper\n",
            "Twilight Sparkle  |  My_Little_Pony__Friendship_is_Magic  |  《我的小马驹：友谊之魔》中的暮光星辉 | silk-road/ChatHaruhi-from-RoleLLM/Twilight Sparkle\n",
            "Oliver Queen  |  Arrow  |  《绿箭侠》中的奥利弗·皇后 | silk-road/ChatHaruhi-from-RoleLLM/Oliver Queen\n",
            "Leroy Jethro Gibbs  |  NCIS  |  《海军罪案调查处》中的利洛伊·杰斯罗·吉布斯 | silk-road/ChatHaruhi-from-RoleLLM/Leroy Jethro Gibbs\n",
            "Angel  |  Angel  |  《天使》中的天使 | silk-road/ChatHaruhi-from-RoleLLM/Angel\n",
            "Klaus Mikaelson  |  The_Originals  |  《始祖家族》中的克劳斯·米卡尔森 | silk-road/ChatHaruhi-from-RoleLLM/Klaus Mikaelson\n",
            "Queen Catherine  |  Reign  |  《王权》中的凯瑟琳女王 | silk-road/ChatHaruhi-from-RoleLLM/Queen Catherine\n",
            "Dr. Hannibal Lecter  |  Hannibal  |  《汉尼拔》中的汉尼拔·莱克特医生 | silk-road/ChatHaruhi-from-RoleLLM/Dr. Hannibal Lecter\n",
            "Coach Eric Taylor  |  Friday_Night_Lights  |  《星期五之光》中的教练埃里克·泰勒 | silk-road/ChatHaruhi-from-RoleLLM/Coach Eric Taylor\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "0v905Q0tTapH"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}